rss2email-2.71.orig/ 0000755 0000000 0000000 00000000000 11534222005 012773 5 ustar root root rss2email-2.71.orig/rss2email.py 0000755 0000000 0000000 00000075170 11534214212 015264 0 ustar root root #!/usr/bin/python """rss2email: get RSS feeds emailed to you http://rss2email.infogami.com Usage: new [emailaddress] (create new feedfile) email newemailaddress (update default email) run [--no-send] [num] add feedurl [emailaddress] list reset delete n pause n unpause n opmlexport opmlimport filename """ __version__ = "2.70" __author__ = "Lindsey Smith (lindsey@allthingsrss.com)" __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3." ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", "Matej Cepl", "Martin 'Joey' Schulze", "Marcel Ackermann (http://www.DreamFlasher.de)", "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ] import urllib2 urllib2.install_opener(urllib2.build_opener()) ### Vaguely Customizable Options ### # The email address messages are from by default: DEFAULT_FROM = "bozo@dev.null.invalid" # 1: Send text/html messages when possible. # 0: Convert HTML to plain text. HTML_MAIL = 0 # 1: Only use the DEFAULT_FROM address. # 0: Use the email address specified by the feed, when possible. FORCE_FROM = 0 # 1: Receive one email per post. # 0: Receive an email every time a post changes. TRUST_GUID = 1 # 1: Generate Date header based on item's date, when possible. # 0: Generate Date header based on time sent. DATE_HEADER = 0 # A tuple consisting of some combination of # ('issued', 'created', 'modified', 'expired') # expressing ordered list of preference in dates # to use for the Date header of the email. DATE_HEADER_ORDER = ('modified', 'issued', 'created') # 1: Apply Q-P conversion (required for some MUAs). # 0: Send message in 8-bits. # http://cr.yp.to/smtp/8bitmime.html #DEPRECATED QP_REQUIRED = 0 #DEPRECATED # 1: Name feeds as they're being processed. # 0: Keep quiet. VERBOSE = 0 # 1: Use the publisher's email if you can't find the author's. # 0: Just use the DEFAULT_FROM email instead. USE_PUBLISHER_EMAIL = 0 # 1: Use SMTP_SERVER to send mail. # 0: Call /usr/sbin/sendmail to send mail. SMTP_SEND = 0 SMTP_SERVER = "smtp.yourisp.net:25" AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1 SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here # Connect to the SMTP server using SSL SMTP_SSL = 0 # Set this to add a bonus header to all emails (start with '\n'). BONUS_HEADER = '' # Example: BONUS_HEADER = '\nApproved: joe@bob.org' # Set this to override From addresses. Keys are feed URLs, values are new titles. OVERRIDE_FROM = {} # Set this to override From email addresses. Keys are feed URLs, values are new emails. OVERRIDE_EMAIL = {} # Set this to default From email addresses. Keys are feed URLs, values are new email addresses. 
DEFAULT_EMAIL = {} # Only use the email from address rather than friendly name plus email address NO_FRIENDLY_NAME = 0 # Set this to override the timeout (in seconds) for feed server response FEED_TIMEOUT = 60 # Optional CSS styling USE_CSS_STYLING = 0 STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }' # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/' PROXY="" # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8' from email.MIMEText import MIMEText from email.Header import Header from email.Utils import parseaddr, formataddr # Note: You can also override the send function. def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None): """Send an email. All arguments should be Unicode strings (plain ASCII works as well). Only the real name part of sender and recipient addresses may contain non-ASCII characters. The email will be properly MIME encoded and delivered though SMTP to localhost port 25. This is easy to change if you want something different. The charset of the email will be the first one out of the list that can represent all the characters occurring in the email. """ # Header class is smart enough to try US-ASCII, then the charset we # provide, then fall back to UTF-8. header_charset = 'ISO-8859-1' # We must choose the body charset manually for body_charset in CHARSET_LIST: try: body.encode(body_charset) except (UnicodeError, LookupError): pass else: break # Split real name (which is optional) and email address parts sender_name, sender_addr = parseaddr(sender) recipient_name, recipient_addr = parseaddr(recipient) # We must always pass Unicode strings to Header, otherwise it will # use RFC 2047 encoding even on plain ASCII strings. 
sender_name = str(Header(unicode(sender_name), header_charset)) recipient_name = str(Header(unicode(recipient_name), header_charset)) # Make sure email addresses do not contain non-ASCII characters sender_addr = sender_addr.encode('ascii') recipient_addr = recipient_addr.encode('ascii') # Create the message ('plain' stands for Content-Type: text/plain) msg = MIMEText(body.encode(body_charset), contenttype, body_charset) msg['To'] = formataddr((recipient_name, recipient_addr)) msg['Subject'] = Header(unicode(subject), header_charset) for hdr in extraheaders.keys(): try: msg[hdr] = Header(unicode(extraheaders[hdr], header_charset)) except: msg[hdr] = Header(extraheaders[hdr]) fromhdr = formataddr((sender_name, sender_addr)) msg['From'] = fromhdr msg_as_string = msg.as_string() #DEPRECATED if QP_REQUIRED: #DEPRECATED ins, outs = SIO(msg_as_string), SIO() #DEPRECATED mimify.mimify(ins, outs) #DEPRECATED msg_as_string = outs.getvalue() if SMTP_SEND: if not smtpserver: import smtplib try: if SMTP_SSL: smtpserver = smtplib.SMTP_SSL() else: smtpserver = smtplib.SMTP() smtpserver.connect(SMTP_SERVER) except KeyboardInterrupt: raise except Exception, e: print >>warn, "" print >>warn, ('Fatal error: could not connect to mail server "%s"' % SMTP_SERVER) print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly') if hasattr(e, 'reason'): print >>warn, "Reason:", e.reason sys.exit(1) if AUTHREQUIRED: try: smtpserver.ehlo() if not SMTP_SSL: smtpserver.starttls() smtpserver.ehlo() smtpserver.login(SMTP_USER, SMTP_PASS) except KeyboardInterrupt: raise except Exception, e: print >>warn, "" print >>warn, ('Fatal error: could not authenticate with mail server "%s" as user "%s"' % (SMTP_SERVER, SMTP_USER)) print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly') if hasattr(e, 'reason'): print >>warn, "Reason:", e.reason sys.exit(1) smtpserver.sendmail(sender, recipient, msg_as_string) return smtpserver else: try: p = subprocess.Popen(["/usr/sbin/sendmail", recipient], stdin=subprocess.PIPE, stdout=subprocess.PIPE) p.communicate(msg_as_string) status = p.returncode assert status != None, "just a sanity check" if status != 0: print >>warn, "" print >>warn, ('Fatal error: sendmail exited with code %s' % status) sys.exit(1) except: print '''Error attempting to send email via sendmail. Possibly you need to configure your config.py to use a SMTP server? Please refer to the rss2email documentation or website (http://rss2email.infogami.com) for complete documentation of config.py. The options below may suffice for configuring email: # 1: Use SMTP_SERVER to send mail. # 0: Call /usr/sbin/sendmail to send mail. SMTP_SEND = 0 SMTP_SERVER = "smtp.yourisp.net:25" AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1 SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here ''' sys.exit(1) return None ## html2text options ## # Use Unicode characters instead of their ascii psuedo-replacements UNICODE_SNOB = 0 # Put the links after each paragraph instead of at the end. LINKS_EACH_PARAGRAPH = 0 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) BODY_WIDTH = 0 ### Load the Options ### # Read options from config file if present. import sys sys.path.insert(0,".") try: from config import * except: pass warn = sys.stderr if QP_REQUIRED: print >>warn, "QP_REQUIRED has been deprecated in rss2email." 
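# Example (hypothetical feed URL and addresses) of how the per-feed override
# dictionaries documented in the options above are filled in; these settings
# would normally live in config.py rather than being edited here:
#
#   OVERRIDE_FROM  = {"http://example.com/feed.xml": "Example Feed"}
#   OVERRIDE_EMAIL = {"http://example.com/feed.xml": "news@example.com"}
#   DEFAULT_EMAIL  = {"http://example.com/feed.xml": "fallback@example.com"}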
### Import Modules ### import cPickle as pickle, time, os, traceback, sys, types, subprocess hash = () try: import hashlib hash = hashlib.md5 except ImportError: import md5 hash = md5.new unix = 0 try: import fcntl # A pox on SunOS file locking methods if (sys.platform.find('sunos') == -1): unix = 1 except: pass import socket; socket_errors = [] for e in ['error', 'gaierror']: if hasattr(socket, e): socket_errors.append(getattr(socket, e)) #DEPRECATED import mimify #DEPRECATED from StringIO import StringIO as SIO #DEPRECATED mimify.CHARSET = 'utf-8' import feedparser feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.allthingsrss.com/rss2email/" import html2text as h2t h2t.UNICODE_SNOB = UNICODE_SNOB h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH h2t.BODY_WIDTH = BODY_WIDTH html2text = h2t.html2text from types import * ### Utility Functions ### import threading class TimeoutError(Exception): pass class InputError(Exception): pass def timelimit(timeout, function): # def internal(function): def internal2(*args, **kw): """ from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/473878 """ class Calculator(threading.Thread): def __init__(self): threading.Thread.__init__(self) self.result = None self.error = None def run(self): try: self.result = function(*args, **kw) except: self.error = sys.exc_info() c = Calculator() c.setDaemon(True) # don't hold up exiting c.start() c.join(timeout) if c.isAlive(): raise TimeoutError if c.error: raise c.error[0], c.error[1] return c.result return internal2 # return internal def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u'')) def ishtml(t): return type(t) is type(()) def contains(a,b): return a.find(b) != -1 def unu(s): # I / freakin' hate / that unicode if type(s) is types.UnicodeType: return s.encode('utf-8') else: return s ### Parsing Utilities ### def getContent(entry, HTMLOK=0): """Select the best content from an entry, deHTMLizing if necessary. If raw HTML is best, an ('HTML', best) tuple is returned. """ # How this works: # * We have a bunch of potential contents. # * We go thru looking for our first choice. # (HTML or text, depending on HTMLOK) # * If that doesn't work, we go thru looking for our second choice. # * If that still doesn't work, we just take the first one. # # Possible future improvement: # * Instead of just taking the first one # pick the one in the "best" language. 
# * HACK: hardcoded HTMLOK, should take a tuple of media types conts = entry.get('content', []) if entry.get('summary_detail', {}): conts += [entry.summary_detail] if conts: if HTMLOK: for c in conts: if contains(c.type, 'html'): return ('HTML', c.value) if not HTMLOK: # Only need to convert to text if HTML isn't OK for c in conts: if contains(c.type, 'html'): return html2text(c.value) for c in conts: if c.type == 'text/plain': return c.value return conts[0].value return "" def getID(entry): """Get best ID from an entry.""" if TRUST_GUID: if 'id' in entry and entry.id: # Newer versions of feedparser could return a dictionary if type(entry.id) is DictType: return entry.id.values()[0] return entry.id content = getContent(entry) if content and content != "\n": return hash(unu(content)).hexdigest() if 'link' in entry: return entry.link if 'title' in entry: return hash(unu(entry.title)).hexdigest() def getName(r, entry): """Get the best name.""" if NO_FRIENDLY_NAME: return '' feed = r.feed if hasattr(r, "url") and r.url in OVERRIDE_FROM.keys(): return OVERRIDE_FROM[r.url] name = feed.get('title', '') if 'name' in entry.get('author_detail', []): # normally {} but py2.1 if entry.author_detail.name: if name: name += ": " det=entry.author_detail.name try: name += entry.author_detail.name except UnicodeDecodeError: name += unicode(entry.author_detail.name, 'utf-8') elif 'name' in feed.get('author_detail', []): if feed.author_detail.name: if name: name += ", " name += feed.author_detail.name return name def validateEmail(email, planb): """Do a basic quality check on email address, but return planb if email doesn't appear to be well-formed""" email_parts = email.split('@') if len(email_parts) != 2: return planb return email def getEmail(r, entry): """Get the best email_address. If the best guess isn't well-formed (something@somthing.com), use DEFAULT_FROM instead""" feed = r.feed if FORCE_FROM: return DEFAULT_FROM if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys(): return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM) if 'email' in entry.get('author_detail', []): return validateEmail(entry.author_detail.email, DEFAULT_FROM) if 'email' in feed.get('author_detail', []): return validateEmail(feed.author_detail.email, DEFAULT_FROM) if USE_PUBLISHER_EMAIL: if 'email' in feed.get('publisher_detail', []): return validateEmail(feed.publisher_detail.email, DEFAULT_FROM) if feed.get("errorreportsto", ''): return validateEmail(feed.errorreportsto, DEFAULT_FROM) if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys(): return DEFAULT_EMAIL[r.url] return DEFAULT_FROM ### Simple Database of Feeds ### class Feed: def __init__(self, url, to): self.url, self.etag, self.modified, self.seen = url, None, None, {} self.active = True self.to = to def load(lock=1): if not os.path.exists(feedfile): print 'Feedfile "%s" does not exist. If you\'re using r2e for the first time, you' % feedfile print "have to run 'r2e new' first." 
sys.exit(1) try: feedfileObject = open(feedfile, 'r') except IOError, e: print "Feedfile could not be opened: %s" % e sys.exit(1) feeds = pickle.load(feedfileObject) if lock: locktype = 0 if unix: locktype = fcntl.LOCK_EX fcntl.flock(feedfileObject.fileno(), locktype) #HACK: to deal with lock caching feedfileObject = open(feedfile, 'r') feeds = pickle.load(feedfileObject) if unix: fcntl.flock(feedfileObject.fileno(), locktype) if feeds: for feed in feeds[1:]: if not hasattr(feed, 'active'): feed.active = True return feeds, feedfileObject def unlock(feeds, feedfileObject): if not unix: pickle.dump(feeds, open(feedfile, 'w')) else: fd = open(feedfile+'.tmp', 'w') pickle.dump(feeds, fd) fd.flush() os.fsync(fd.fileno()) fd.close() os.rename(feedfile+'.tmp', feedfile) fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN) #@timelimit(FEED_TIMEOUT) def parse(url, etag, modified): if PROXY == '': return feedparser.parse(url, etag, modified) else: proxy = urllib2.ProxyHandler( {"http":PROXY} ) return feedparser.parse(url, etag, modified, handlers = [proxy]) ### Program Functions ### def add(*args): if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'): urls, to = [args[0]], args[1] else: urls, to = args, None feeds, feedfileObject = load() if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None): print "No email address has been defined. Please run 'r2e email emailaddress' or" print "'r2e add url emailaddress'." sys.exit(1) for url in urls: feeds.append(Feed(url, to)) unlock(feeds, feedfileObject) def run(num=None): feeds, feedfileObject = load() smtpserver = None try: # We store the default to address as the first item in the feeds list. # Here we take it out and save it for later. default_to = "" if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] else: ifeeds = feeds if num: ifeeds = [feeds[num]] feednum = 0 for f in ifeeds: try: feednum += 1 if not f.active: continue if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url) r = {} try: r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified) except TimeoutError: print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url) continue # Handle various status conditions, as required if 'status' in r: if r.status == 301: f.url = r['url'] elif r.status == 410: print >>warn, "W: feed gone; deleting", f.url feeds.remove(f) continue http_status = r.get('status', 200) if VERBOSE > 1: print >>warn, "I: http status", http_status http_headers = r.get('headers', { 'content-type': 'application/rss+xml', 'content-length':'1'}) exc_type = r.get("bozo_exception", Exception()).__class__ if http_status != 304 and not r.entries and not r.get('version', ''): if http_status not in [200, 302]: print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url) elif contains(http_headers.get('content-type', 'rss'), 'html'): print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url) elif http_headers.get('content-length', '1') == '0': print >>warn, "W: empty page [%d] %s" % (feednum, f.url) elif hasattr(socket, 'timeout') and exc_type == socket.timeout: print >>warn, "W: timed out on [%d] %s" % (feednum, f.url) elif exc_type == IOError: print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url) elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error: print >>warn, "W: broken compression [%d] %s" % (feednum, f.url) elif exc_type in socket_errors: exc_reason = r.bozo_exception.args[1] print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url) elif 
exc_type == urllib2.URLError: if r.bozo_exception.reason.__class__ in socket_errors: exc_reason = r.bozo_exception.reason.args[1] else: exc_reason = r.bozo_exception.reason print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url) elif exc_type == AttributeError: print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url) elif exc_type == KeyboardInterrupt: raise r.bozo_exception elif r.bozo: print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process")) else: print >>warn, "=== rss2email encountered a problem with this feed ===" print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ===" print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ===" print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url print >>warn, r print >>warn, "rss2email", __version__ print >>warn, "feedparser", feedparser.__version__ print >>warn, "html2text", h2t.__version__ print >>warn, "Python", sys.version print >>warn, "=== END HERE ===" continue r.entries.reverse() for entry in r.entries: id = getID(entry) # If TRUST_GUID isn't set, we get back hashes of the content. # Instead of letting these run wild, we put them in context # by associating them with the actual ID (if it exists). frameid = entry.get('id') if not(frameid): frameid = id if type(frameid) is DictType: frameid = frameid.values()[0] # If this item's ID is in our database # then it's already been sent # and we don't need to do anything more. if frameid in f.seen: if f.seen[frameid] == id: continue if not (f.to or default_to): print "No default email address defined. Please run 'r2e email emailaddress'" print "Ignoring feed %s" % f.url break if 'title_detail' in entry and entry.title_detail: title = entry.title_detail.value if contains(entry.title_detail.type, 'html'): title = html2text(title) else: title = getContent(entry)[:70] title = title.replace("\n", " ").strip() datetime = time.gmtime() if DATE_HEADER: for datetype in DATE_HEADER_ORDER: kind = datetype+"_parsed" if kind in entry and entry[kind]: datetime = entry[kind] link = entry.get('link', "") from_addr = getEmail(r, entry) name = h2t.unescape(getName(r, entry)) fromhdr = formataddr((name, from_addr,)) tohdr = (f.to or default_to) subjecthdr = title datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) useragenthdr = "rss2email" # Add post tags, if available tagline = "" if 'tags' in entry: tags = entry.get('tags') taglist = [] if tags: for tag in tags: taglist.append(tag['term']) if taglist: tagline = ",".join(taglist) extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline} if BONUS_HEADER != '': for hdr in BONUS_HEADER.strip().splitlines(): pos = hdr.strip().find(':') if pos > 0: extraheaders[hdr[:pos]] = hdr[pos+1:].strip() else: print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER entrycontent = getContent(entry, HTMLOK=HTML_MAIL) contenttype = 'plain' content = '' if USE_CSS_STYLING and HTML_MAIL: contenttype = 'html' content = "\n" content += '
\n' content += '\n' content += '\n' + body + ' |
URL: '+link+'
' ) if hasattr(entry,'enclosures'): for enclosure in entry.enclosures: if enclosure.url != "": content += ('Enclosure: '+enclosure.url+", tag='pre', attrs=[('class', 'screen')] if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag) uattrs = [] strattrs='' if attrs: for key, value in attrs: value=value.replace('>','>').replace('<','<').replace('"','"') value = self.bare_ampersand.sub("&", value) # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds if type(value) != type(u''): try: value = unicode(value, self.encoding) except: value = unicode(value, 'iso-8859-1') try: # Currently, in Python 3 the key is already a str, and cannot be decoded again uattrs.append((unicode(key, self.encoding), value)) except TypeError: uattrs.append((key, value)) strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]) if self.encoding: try: strattrs=strattrs.encode(self.encoding) except: pass if tag in self.elements_no_end_tag: self.pieces.append('<%(tag)s%(strattrs)s />' % locals()) else: self.pieces.append('<%(tag)s%(strattrs)s>' % locals()) def unknown_endtag(self, tag): # called for each end tag, e.g. for, tag will be 'pre' # Reconstruct the original end tag. if tag not in self.elements_no_end_tag: self.pieces.append("%(tag)s>" % locals()) def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' # Reconstruct the original character reference. if ref.startswith('x'): value = unichr(int(ref[1:],16)) else: value = unichr(int(ref)) if value in _cp1252.keys(): self.pieces.append('%s;' % hex(ord(_cp1252[value]))[1:]) else: self.pieces.append('%(ref)s;' % locals()) def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. if name2codepoint.has_key(ref): self.pieces.append('&%(ref)s;' % locals()) else: self.pieces.append('&%(ref)s' % locals()) def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text) self.pieces.append(text) def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. self.pieces.append('' % locals()) def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. self.pieces.append('%(text)s>' % locals()) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. 
# # Reconstruct original DOCTYPE self.pieces.append('' % locals()) _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) if i == n: return None, -1 m = self._new_declname_match(rawdata, i) if m: s = m.group() name = s.strip() if (i + len(s)) == n: return None, -1 # end of buffer return name.lower(), m.end() else: self.handle_data(rawdata) # self.updatepos(declstartpos, i) return None, -1 def convert_charref(self, name): return '%s;' % name def convert_entityref(self, name): return '&%s;' % name def output(self): '''Return processed HTML as a single string''' return ''.join([str(p) for p in self.pieces]) def parse_declaration(self, i): try: return sgmllib.SGMLParser.parse_declaration(self, i) except sgmllib.SGMLParseError: # escape the doctype declaration and continue parsing self.handle_data('<') return i+1 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): def __init__(self, baseuri, baselang, encoding, entities): sgmllib.SGMLParser.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml') self.entities=entities def decodeEntities(self, element, data): data = data.replace('<', '<') data = data.replace('<', '<') data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('>', '>') data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace('"', '"') data = data.replace(''', ''') data = data.replace(''', ''') if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace(''', "'") return data def strattrs(self, attrs): return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs]) class _MicroformatsParser: STRING = 1 DATE = 2 URI = 3 NODE = 4 EMAIL = 5 known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'] known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'] def __init__(self, data, baseuri, encoding): self.document = BeautifulSoup.BeautifulSoup(data) self.baseuri = baseuri self.encoding = encoding if type(data) == type(u''): data = data.encode(encoding) self.tags = [] self.enclosures = [] self.xfn = [] self.vcard = None def vcardEscape(self, s): if type(s) in (type(''), type(u'')): s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') return s def vcardFold(self, s): s = re.sub(';+$', '', s) sFolded = '' iMax = 75 sPrefix = '' while len(s) > iMax: sFolded += sPrefix + s[:iMax] + '\n' s = s[iMax:] sPrefix = ' ' iMax = 74 sFolded += sPrefix + s return sFolded def normalize(self, s): return re.sub(r'\s+', ' ', s).strip() def unique(self, aList): results = [] for element in aList: if element not in results: results.append(element) return results def toISO8601(self, dt): return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, 
bAutoEscape=0): all = lambda x: 1 sProperty = sProperty.lower() bFound = 0 bNormalize = 1 propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} if bAllowMultiple and (iPropertyType != self.NODE): snapResults = [] containers = elmRoot(['ul', 'ol'], propertyMatch) for container in containers: snapResults.extend(container('li')) bFound = (len(snapResults) != 0) if not bFound: snapResults = elmRoot(all, propertyMatch) bFound = (len(snapResults) != 0) if (not bFound) and (sProperty == 'value'): snapResults = elmRoot('pre') bFound = (len(snapResults) != 0) bNormalize = not bFound if not bFound: snapResults = [elmRoot] bFound = (len(snapResults) != 0) arFilter = [] if sProperty == 'vcard': snapFilter = elmRoot(all, propertyMatch) for node in snapFilter: if node.findParent(all, propertyMatch): arFilter.append(node) arResults = [] for node in snapResults: if node not in arFilter: arResults.append(node) bFound = (len(arResults) != 0) if not bFound: if bAllowMultiple: return [] elif iPropertyType == self.STRING: return '' elif iPropertyType == self.DATE: return None elif iPropertyType == self.URI: return '' elif iPropertyType == self.NODE: return None else: return None arValues = [] for elmResult in arResults: sValue = None if iPropertyType == self.NODE: if bAllowMultiple: arValues.append(elmResult) continue else: return elmResult sNodeName = elmResult.name.lower() if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] if sValue: sValue = bNormalize and self.normalize(sValue) or sValue.strip() if (not sValue) and (sNodeName == 'abbr'): sValue = elmResult.get('title') if sValue: sValue = bNormalize and self.normalize(sValue) or sValue.strip() if (not sValue) and (iPropertyType == self.URI): if sNodeName == 'a': sValue = elmResult.get('href') elif sNodeName == 'img': sValue = elmResult.get('src') elif sNodeName == 'object': sValue = elmResult.get('data') if sValue: sValue = bNormalize and self.normalize(sValue) or sValue.strip() if (not sValue) and (sNodeName == 'img'): sValue = elmResult.get('alt') if sValue: sValue = bNormalize and self.normalize(sValue) or sValue.strip() if not sValue: sValue = elmResult.renderContents() sValue = re.sub(r'<\S[^>]*>', '', sValue) sValue = sValue.replace('\r\n', '\n') sValue = sValue.replace('\r', '\n') if sValue: sValue = bNormalize and self.normalize(sValue) or sValue.strip() if not sValue: continue if iPropertyType == self.DATE: sValue = _parse_date_iso8601(sValue) if bAllowMultiple: arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) else: return bAutoEscape and self.vcardEscape(sValue) or sValue return arValues def findVCards(self, elmRoot, bAgentParsing=0): sVCards = '' if not bAgentParsing: arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) else: arCards = [elmRoot] for elmCard in arCards: arLines = [] def processSingleString(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) return sValue or u'' def processSingleURI(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.URI) if sValue: sContentType = '' sEncoding = '' sValueKey = '' if sValue.startswith('data:'): sEncoding = ';ENCODING=b' sContentType = sValue.split(';')[0].split('/').pop() sValue = sValue.split(',', 1).pop() else: elmValue = self.getPropertyValue(elmCard, sProperty) if elmValue: if sProperty != 'url': sValueKey = ';VALUE=uri' 
sContentType = elmValue.get('type', '').strip().split('/').pop().strip() sContentType = sContentType.upper() if sContentType == 'OCTET-STREAM': sContentType = '' if sContentType: sContentType = ';TYPE=' + sContentType.upper() arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) def processTypeValue(sProperty, arDefaultType, arForceType=None): arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) for elmResult in arResults: arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) if arForceType: arType = self.unique(arForceType + arType) if not arType: arType = arDefaultType sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) # AGENT # must do this before all other properties because it is destructive # (removes nested class="vcard" nodes so they don't interfere with # this vcard's other properties) arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) for elmAgent in arAgent: if re.compile(r'\bvcard\b').search(elmAgent.get('class')): sAgentValue = self.findVCards(elmAgent, 1) + '\n' sAgentValue = sAgentValue.replace('\n', '\\n') sAgentValue = sAgentValue.replace(';', '\\;') if sAgentValue: arLines.append(self.vcardFold('AGENT:' + sAgentValue)) # Completely remove the agent element from the parse tree elmAgent.extract() else: sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); if sAgentValue: arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) # FN (full name) sFN = processSingleString('fn') # N (name) elmName = self.getPropertyValue(elmCard, 'n') if elmName: sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) arLines.append(self.vcardFold('N:' + sFamilyName + ';' + sGivenName + ';' + ','.join(arAdditionalNames) + ';' + ','.join(arHonorificPrefixes) + ';' + ','.join(arHonorificSuffixes))) elif sFN: # implied "N" optimization # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization arNames = self.normalize(sFN).split() if len(arNames) == 2: bFamilyNameFirst = (arNames[0].endswith(',') or len(arNames[1]) == 1 or ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) if bFamilyNameFirst: arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) else: arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) # SORT-STRING sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) if sSortString: arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) # NICKNAME arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) if arNickname: arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) # PHOTO processSingleURI('photo') # BDAY dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) if dtBday: arLines.append(self.vcardFold('BDAY:' + 
self.toISO8601(dtBday))) # ADR (address) arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) for elmAdr in arAdr: arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) if not arType: arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + sPostOfficeBox + ';' + sExtendedAddress + ';' + sStreetAddress + ';' + sLocality + ';' + sRegion + ';' + sPostalCode + ';' + sCountryName)) # LABEL processTypeValue('label', ['intl','postal','parcel','work']) # TEL (phone number) processTypeValue('tel', ['voice']) # EMAIL processTypeValue('email', ['internet'], ['internet']) # MAILER processSingleString('mailer') # TZ (timezone) processSingleString('tz') # GEO (geographical information) elmGeo = self.getPropertyValue(elmCard, 'geo') if elmGeo: sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) # TITLE processSingleString('title') # ROLE processSingleString('role') # LOGO processSingleURI('logo') # ORG (organization) elmOrg = self.getPropertyValue(elmCard, 'org') if elmOrg: sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) if not sOrganizationName: # implied "organization-name" optimization # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) if sOrganizationName: arLines.append(self.vcardFold('ORG:' + sOrganizationName)) else: arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) # CATEGORY arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) if arCategory: arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) # NOTE processSingleString('note') # REV processSingleString('rev') # SOUND processSingleURI('sound') # UID processSingleString('uid') # URL processSingleURI('url') # CLASS processSingleString('class') # KEY processSingleURI('key') if arLines: arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] sVCards += u'\n'.join(arLines) + u'\n' return sVCards.strip() def isProbablyDownloadable(self, elm): attrsD = elm.attrMap if not attrsD.has_key('href'): return 0 linktype = attrsD.get('type', '').strip() if linktype.startswith('audio/') or \ linktype.startswith('video/') or \ (linktype.startswith('application/') and not linktype.endswith('xml')): return 1 path = urlparse.urlparse(attrsD['href'])[2] if path.find('.') == -1: return 0 fileext = path.split('.').pop().lower() return fileext in self.known_binary_extensions def findTags(self): all = lambda x: 1 for elm in 
self.document(all, {'rel': re.compile(r'\btag\b')}): href = elm.get('href') if not href: continue urlscheme, domain, path, params, query, fragment = \ urlparse.urlparse(_urljoin(self.baseuri, href)) segments = path.split('/') tag = segments.pop() if not tag: tag = segments.pop() tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', '')) if not tagscheme.endswith('/'): tagscheme += '/' self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) def findEnclosures(self): all = lambda x: 1 enclosure_match = re.compile(r'\benclosure\b') for elm in self.document(all, {'href': re.compile(r'.+')}): if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue if elm.attrMap not in self.enclosures: self.enclosures.append(elm.attrMap) if elm.string and not elm.get('title'): self.enclosures[-1]['title'] = elm.string def findXFN(self): all = lambda x: 1 for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): rels = elm.get('rel', '').split() xfn_rels = [] for rel in rels: if rel in self.known_xfn_relationships: xfn_rels.append(rel) if xfn_rels: self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) def _parseMicroformats(htmlSource, baseURI, encoding): if not BeautifulSoup: return if _debug: sys.stderr.write('entering _parseMicroformats\n') try: p = _MicroformatsParser(htmlSource, baseURI, encoding) except UnicodeEncodeError: # sgmllib throws this exception when performing lookups of tags # with non-ASCII characters in them. return p.vcard = p.findVCards(p.document) p.findTags() p.findEnclosures() p.findXFN() return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} class _RelativeURIResolver(_BaseHTMLProcessor): relative_uris = [('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'), ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'), ('iframe', 'longdesc'), ('iframe', 'src'), ('head', 'profile'), ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'), ('input', 'src'), ('input', 'usemap'), ('ins', 'cite'), ('link', 'href'), ('object', 'classid'), ('object', 'codebase'), ('object', 'data'), ('object', 'usemap'), ('q', 'cite'), ('script', 'src')] def __init__(self, baseuri, encoding, _type): _BaseHTMLProcessor.__init__(self, encoding, _type) self.baseuri = baseuri def resolveURI(self, uri): return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip())) def unknown_starttag(self, tag, attrs): if _debug: sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs))) attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') p = _RelativeURIResolver(baseURI, encoding, _type) p.feed(htmlSource) return p.output() def _makeSafeAbsoluteURI(base, rel=None): # bail if ACCEPTABLE_URI_SCHEMES is empty if not ACCEPTABLE_URI_SCHEMES: return _urljoin(base, rel or u'') if not base: return rel or u'' if not rel: scheme = urlparse.urlparse(base)[0] if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: return base return u'' uri = _urljoin(base, rel) if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: return u'' return uri class _HTMLSanitizer(_BaseHTMLProcessor): 
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', 'background', 'balance', 'bgcolor', 'bgproperties', 'border', 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', 'xml:lang'] unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] acceptable_css_properties = ['azimuth', 'background-color', 'border-bottom-color', 'border-collapse', 'border-color', 'border-left-color', 'border-right-color', 'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height', 'overflow', 'pause', 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', 'white-space', 'width'] # survey of common keywords found in feeds acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline', 'white', 'yellow'] 
valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'none', 'semantics'] mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] # svgtiny - foreignObject + linearGradient + radialGradient + stop svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] # svgtiny + class + opacity + offset + xmlns + xmlns:xlink svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', 'arabic-form', 'ascent', 'attributeName', 'attributeType', 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', 'min', 'name', 'offset', 'opacity', 'orient', 'origin', 'overline-position', 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity', 'strikethrough-position', 'strikethrough-thickness', 'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', 'underline-position', 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] svg_attr_map = None svg_elem_map = None acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', 
'stroke-opacity'] def reset(self): _BaseHTMLProcessor.reset(self) self.unacceptablestack = 0 self.mathmlOK = 0 self.svgOK = 0 def unknown_starttag(self, tag, attrs): acceptable_attributes = self.acceptable_attributes keymap = {} if not tag in self.acceptable_elements or self.svgOK: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 # add implicit namespaces to html5 inline svg/mathml if self._type.endswith('html'): if not dict(attrs).get('xmlns'): if tag=='svg': attrs.append( ('xmlns','http://www.w3.org/2000/svg') ) if tag=='math': attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') ) # not otherwise acceptable, perhaps it is MathML or SVG? if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs: self.mathmlOK += 1 if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs: self.svgOK += 1 # chose acceptable attributes based on tag class, else bail if self.mathmlOK and tag in self.mathml_elements: acceptable_attributes = self.mathml_attributes elif self.svgOK and tag in self.svg_elements: # for most vocabularies, lowercasing is a good idea. Many # svg elements, however, are camel case if not self.svg_attr_map: lower=[attr.lower() for attr in self.svg_attributes] mix=[a for a in self.svg_attributes if a not in lower] self.svg_attributes = lower self.svg_attr_map = dict([(a.lower(),a) for a in mix]) lower=[attr.lower() for attr in self.svg_elements] mix=[a for a in self.svg_elements if a not in lower] self.svg_elements = lower self.svg_elem_map = dict([(a.lower(),a) for a in mix]) acceptable_attributes = self.svg_attributes tag = self.svg_elem_map.get(tag,tag) keymap = self.svg_attr_map elif not tag in self.acceptable_elements: return # declare xlink namespace, if needed if self.mathmlOK or self.svgOK: if filter(lambda (n,v): n.startswith('xlink:'),attrs): if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) clean_attrs = [] for key, value in self.normalize_attrs(attrs): if key in acceptable_attributes: key=keymap.get(key,key) # make sure the uri uses an acceptable uri scheme if key == u'href': value = _makeSafeAbsoluteURI(value) clean_attrs.append((key,value)) elif key=='style': clean_value = self.sanitize_style(value) if clean_value: clean_attrs.append((key,clean_value)) _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) def unknown_endtag(self, tag): if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 if self.mathmlOK and tag in self.mathml_elements: if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 elif self.svgOK and tag in self.svg_elements: tag = self.svg_elem_map.get(tag,tag) if tag == 'svg' and self.svgOK: self.svgOK -= 1 else: return _BaseHTMLProcessor.unknown_endtag(self, tag) def handle_pi(self, text): pass def handle_decl(self, text): pass def handle_data(self, text): if not self.unacceptablestack: _BaseHTMLProcessor.handle_data(self, text) def sanitize_style(self, style): # disallow urls style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) # gauntlet if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' # This replaced a regexp that used re.match and was prone to pathological back-tracking. 
if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return '' clean = [] for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): if not value: continue if prop.lower() in self.acceptable_css_properties: clean.append(prop + ': ' + value + ';') elif prop.split('-')[0].lower() in ['background','border','margin','padding']: for keyword in value.split(): if not keyword in self.acceptable_css_keywords and \ not self.valid_css_values.match(keyword): break else: clean.append(prop + ': ' + value + ';') elif self.svgOK and prop.lower() in self.acceptable_svg_properties: clean.append(prop + ': ' + value + ';') return ' '.join(clean) def parse_comment(self, i, report=1): ret = _BaseHTMLProcessor.parse_comment(self, i, report) if ret >= 0: return ret # if ret == -1, this may be a malicious attempt to circumvent # sanitization, or a page-destroying unclosed comment match = re.compile(r'--[^>]*>').search(self.rawdata, i+4) if match: return match.end() # unclosed comment; deliberately fail to handle_data() return len(self.rawdata) def _sanitizeHTML(htmlSource, encoding, _type): p = _HTMLSanitizer(encoding, _type) htmlSource = htmlSource.replace(''): data = data.split('>', 1)[1] if data.count('= '2.3.3' assert base64 != None user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':') realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] self.add_password(realm, host, user, passw) retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) self.reset_retry_count() return retry except: return self.http_error_default(req, fp, code, msg, headers) def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers): """URL, filename, or string --> stream This function lets you define parsers that take any input source (URL, pathname to local or network file, or actual data as a string) and deal with it in a uniform manner. Returned object is guaranteed to have all the basic stdio read methods (read, readline, readlines). Just .close() the object when you're done with it. If the etag argument is supplied, it will be used as the value of an If-None-Match request header. If the modified argument is supplied, it can be a tuple of 9 integers (as returned by gmtime() in the standard Python time module) or a date string in any format supported by feedparser. Regardless, it MUST be in GMT (Greenwich Mean Time). It will be reformatted into an RFC 1123-compliant date and used as the value of an If-Modified-Since request header. If the agent argument is supplied, it will be used as the value of a User-Agent request header. If the referrer argument is supplied, it will be used as the value of a Referer[sic] request header. If handlers is supplied, it is a list of handlers used to build a urllib2 opener. if request_headers is supplied it is a dictionary of HTTP request headers that will override the values generated by FeedParser. 
""" if hasattr(url_file_stream_or_string, 'read'): return url_file_stream_or_string if url_file_stream_or_string == '-': return sys.stdin if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): # Deal with the feed URI scheme if url_file_stream_or_string.startswith('feed:http'): url_file_stream_or_string = url_file_stream_or_string[5:] elif url_file_stream_or_string.startswith('feed:'): url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:] if not agent: agent = USER_AGENT # test for inline user:password for basic auth auth = None if base64: urltype, rest = urllib.splittype(url_file_stream_or_string) realhost, rest = urllib.splithost(rest) if realhost: user_passwd, realhost = urllib.splituser(realhost) if user_passwd: url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) auth = base64.standard_b64encode(user_passwd).strip() # iri support try: if isinstance(url_file_stream_or_string,unicode): url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8') else: url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8') except: pass # try to open with urllib2 (to use optional headers) request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers) opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()])) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: return opener.open(request) finally: opener.close() # JohnD # try to open with native open function (if url_file_stream_or_string is a filename) try: return open(url_file_stream_or_string, 'rb') except: pass # treat url_file_stream_or_string as string return _StringIO(str(url_file_stream_or_string)) def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers): request = urllib2.Request(url) request.add_header('User-Agent', agent) if etag: request.add_header('If-None-Match', etag) if type(modified) == type(''): modified = _parse_date(modified) elif isinstance(modified, datetime.datetime): modified = modified.utctimetuple() if modified: # format into an RFC 1123-compliant timestamp. We can't use # time.strftime() since the %a and %b directives can be affected # by the current locale, but RFC 2616 states that dates must be # in English. 
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) if referrer: request.add_header('Referer', referrer) if gzip and zlib: request.add_header('Accept-encoding', 'gzip, deflate') elif gzip: request.add_header('Accept-encoding', 'gzip') elif zlib: request.add_header('Accept-encoding', 'deflate') else: request.add_header('Accept-encoding', '') if auth: request.add_header('Authorization', 'Basic %s' % auth) if ACCEPT_HEADER: request.add_header('Accept', ACCEPT_HEADER) # use this for whatever -- cookies, special headers, etc # [('Cookie','Something'),('x-special-header','Another Value')] for header_name, header_value in request_headers.items(): request.add_header(header_name, header_value) request.add_header('A-IM', 'feed') # RFC 3229 support return request _date_handlers = [] def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' _date_handlers.insert(0, func) # ISO-8601 date parsing routines written by Fazal Majid. # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 # parser is beyond the scope of feedparser and would be a worthwhile addition # to the Python library. # A single regular expression cannot parse ISO 8601 date formats into groups # as the standard is highly irregular (for instance is 030104 2003-01-04 or # 0301-04-01), so we use templates instead. # Please note the order in templates is significant because we need a # greedy match. _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', '-YY-?MM', '-OOO', '-YY', '--MM-?DD', '--MM', '---DD', 'CC', ''] _iso8601_re = [ tmpl.replace( 'YYYY', r'(?P
We highly recommend that you subscribe to the rss2email project feed so you can keep up to date with the latest version, bugfixes and features: http://feeds.feedburner.com/allthingsrss/hJBr
Instructions for Windows Users
Before you install rss2email, you'll need to make sure that a few things are in place. First, make sure a version of Python 2.x is installed. Second, find out your outgoing (SMTP) email server's address. That should be all you need.
Edit the config.py file and fill in your outgoing email server's details. If your server requires you to log in, change "AUTHREQUIRED = 0" to "AUTHREQUIRED = 1" and enter your SMTP username and password.
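For example, a config.py set up for an outgoing server that requires authentication might look like this (the server name, username and password are placeholders you must replace with your own):
SMTP_SEND = 1  # use SMTP_SERVER rather than /usr/sbin/sendmail
SMTP_SERVER = "smtp.yourisp.net:25"
AUTHREQUIRED = 1  # the server requires SMTP AUTH
SMTP_USER = 'you@yourisp.net'
SMTP_PASS = 'your-password'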
From the command line, change to the folder you created. Now create a new feed database to send updates to your email address:
r2e new you@yourdomain.com
Subscribe to some feeds:
r2e add http://feeds.feedburner.com/allthingsrss/hJBr
That feed announces new versions of rss2email. Repeat this step for each feed you want to subscribe to.
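To review your subscriptions at any time, list them:
r2e list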
When you run rss2email, it emails you about every story it hasn't seen before. But the first time you run it, that will be every story. To avoid this, you can ask rss2email not to send you any stories the first time you run it:
r2e run --no-send
Then later, you can ask it to email you new stories:
r2e run
If you get an error message "Sender domain must exist", add a line to config.py like this:
DEFAULT_FROM = "rss2email@yoursite.com"
You can make the email address whatever you want, but your mail server requires that the yoursite.com part actually exists.
More than likely you will want rss2email to run automatically at a regular interval. Under Windows this can easily be accomplished with the Windows Task Scheduler: just select r2e.bat as the program to run. Once you've created the task, double-click it in the task list and change the Run entry so that "run" comes after r2e.bat. For example, if you installed rss2email in the C:\rss2email folder, you would change the Run entry from "C:\rss2email\r2e.bat" to "C:\rss2email\r2e.bat run".
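If you prefer the command line, roughly the same task can be created with the schtasks utility; this is only a sketch, and the task name, interval and install path are assumptions to adjust:
schtasks /create /tn "rss2email" /tr "C:\rss2email\r2e.bat run" /sc hourly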
Now jump down to the section on customizing rss2email to your needs.
To upgrade to a new release, simply copy all of the files from the .ZIP package into your install directory EXCEPT config.py, so that your settings are preserved.
Instructions for UNIX Users
Before you install rss2email, you'll need to make sure that a few things are in place. First, make sure a version of Python 2.x is installed. Second, check whether you have sendmail (or a compatible replacement such as postfix) installed. If sendmail isn't installed, find out your outgoing (SMTP) email server's address. That should be all you need.
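A quick way to check both prerequisites from a shell (the exact output will vary from system to system):
python -V        # prints the installed Python version
which sendmail   # prints a path if sendmail (or a replacement) is installed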
A quick way to get rss2email going is to use a pre-made package; releases are available for Debian Linux, Ubuntu Linux, and NetBSD.
If you are unable to use these packages or you want the latest and greatest version, here's what you do:
Unpack the rss2email .tar.gz package (typically with 'tar -xzf') into [folder where you want rss2email files to live]
cd [yourfolder]
chmod +x r2e
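Putting those three steps together (the archive name and the directory it unpacks into are only assumptions here; adjust them to match the file you actually downloaded):
tar -xzf rss2email-2.71.tar.gz
cd rss2email-2.71
chmod +x r2e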
Create a new feed database with your target email address:
./r2e new you@yourdomain.com
Subscribe to some feeds:
./r2e add http://feeds.feedburner.com/allthingsrss/hJBr
That feed announces new versions of rss2email. Repeat this step for each feed you want to subscribe to.
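To review your subscriptions at any time, list them:
./r2e list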
When you run rss2email, it emails you about every story it hasn't seen before. But the first time you run it, that will be every story. To avoid this, you can ask rss2email not to send you any stories the first time you run it:
./r2e run --no-send
Then later, you can ask it to email you new stories:
./r2e run
You probably want to set things up so that this command is run repeatedly. (One good way is via a cron job.)
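For example, a crontab entry like the following (the install path is a placeholder) checks your feeds at the top of every hour:
0 * * * * cd /home/you/rss2email && ./r2e run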
If you get an error message "Sender domain must exist", add a line to config.py like this:
DEFAULT_FROM = "rss2email@yoursite.com"
You can make the email address whatever you want, but your mail server requires that the yoursite.com part actually exists.
To upgrade to a new release, simply copy all of the files from the .tar.gz package into your install directory EXCEPT config.py, so that your settings are preserved.
Customizing rss2email
There are a number of options, described in full at the top of the rss2email.py file, to customize the way rss2email behaves. If you want to change something, edit the config.py file. If you're not using rss2email under Windows, you'll have to create this file if it doesn't already exist; copying the bundled config.py.example to config.py is an easy way to start.
For example, if you want to receive HTML mail, instead of having entries converted to plain text:
HTML_MAIL = 1
To be notified every time a post changes, instead of just when it's first posted:
TRUST_GUID = 0
And to make the emails look as if they were sent when the item was posted:
DATE_HEADER = 1
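Any of the other options listed in config.py.example can be changed the same way. For instance, to replace the friendly From name used for a particular feed, set OVERRIDE_FROM (the feed URL and title below are placeholders):
OVERRIDE_FROM = {'http://example.com/feed.xml': 'Example Feed'}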
rss2email-2.71.orig/r2e.bat 0000755 0000000 0000000 00000000072 10406374010 014156 0 ustar root root @python rss2email.py feeds.dat %1 %2 %3 %4 %5 %6 %7 %8 %9 rss2email-2.71.orig/r2e 0000755 0000000 0000000 00000000053 10406374010 013410 0 ustar root root #!/bin/sh python rss2email.py feeds.dat $* rss2email-2.71.orig/config.py.example 0000755 0000000 0000000 00000006303 11467304276 016271 0 ustar root root ### Options for configuring rss2email ### # The email address messages are from by default: DEFAULT_FROM = "bozo@dev.null.invalid" # 1: Send text/html messages when possible. # 0: Convert HTML to plain text. HTML_MAIL = 1 # 1: Only use the DEFAULT_FROM address. # 0: Use the email address specified by the feed, when possible. FORCE_FROM = 0 # 1: Receive one email per post. # 0: Receive an email every time a post changes. TRUST_GUID = 1 # 1: Generate Date header based on item's date, when possible. # 0: Generate Date header based on time sent. DATE_HEADER = 1 # A tuple consisting of some combination of # ('issued', 'created', 'modified', 'expired') # expressing ordered list of preference in dates # to use for the Date header of the email. DATE_HEADER_ORDER = ('modified', 'issued', 'created') # 1: Apply Q-P conversion (required for some MUAs). # 0: Send message in 8-bits. # http://cr.yp.to/smtp/8bitmime.html #DEPRECATED QP_REQUIRED = 0 #DEPRECATED # 1: Name feeds as they're being processed. # 0: Keep quiet. VERBOSE = 0 # 1: Use the publisher's email if you can't find the author's. # 0: Just use the DEFAULT_FROM email instead. USE_PUBLISHER_EMAIL = 0 # 1: Use SMTP_SERVER to send mail. # 0: Call /usr/sbin/sendmail to send mail. SMTP_SEND = 1 SMTP_SERVER = "smtp.yourisp.net:25" AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1 SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here # Connect to the SMTP server using SSL SMTP_SSL = 0 # Set this to add a bonus header to all emails (start with '\n'). BONUS_HEADER = '' # Example: BONUS_HEADER = '\nApproved: joe@bob.org' # Set this to override From addresses. Keys are feed URLs, values are new titles. OVERRIDE_FROM = {} # Set this to override From email addresses. Keys are feed URLs, values are new emails. OVERRIDE_EMAIL = {} # Set this to default From email addresses. Keys are feed URLs, values are new email addresses. DEFAULT_EMAIL = {} # Only use the email from address rather than friendly name plus email address NO_FRIENDLY_NAME = 0 # Set this to override the timeout (in seconds) for feed server response FEED_TIMEOUT = 60 # Optional CSS styling USE_CSS_STYLING = 1 STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }' # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/' PROXY="" # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'