Processing Feeds

Getting Set Up

$ easy_install FeedParser

Making a Console Script

  • Add a feedstool/scripts directory and create feedstool/scripts/update_feeds.py with the following contents:
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
""" update_feeds [OPTIONS] 

Iterate through each Feed in the FeedContainer, download each Atom
feed, and make new FeedEntry objects.

Options
=======

--config (-c)            Path to ini file (defaults to $CWD/FeedsTool.ini)

--dry-run (-d)           Don't actually commit the transaction

"""

import getopt
import sys
import os
import transaction

from paste.deploy import loadapp
from repoze.bfg.registry import registry_manager
import feedparser
from feedstool.models.interfaces import IFeedEntry

from repoze.lemonade.content import create_content

def log_debug(msg):
    """Placeholder debug logger: accepts ``msg`` and discards it."""
    return None

def update(site):
    """Download and parse every feed in ``site`` and build an entry per item.

    ``site`` is a mapping-like container; each of its values must expose
    ``title`` and ``url`` attributes.  Each URL is parsed with
    ``feedparser.parse`` and, for every entry in the result, an
    ``IFeedEntry`` content object is created from the entry's title and
    summary.

    A feed that feedparser flags as "bozo" (not well-formed) is reported
    through ``log_debug`` and then processed anyway with whatever
    feedparser managed to extract.
    """
    fmt = "Feed %s failed to parse because %s on line %s"

    for feed in site.values():
        print("Processing %s at %s" % (feed.title, feed.url))
        d = feedparser.parse(feed.url)

        if d.bozo:
            # bozo_exception is the exception instance itself, not a callable.
            exc = d.bozo_exception
            # Only SAX parse errors provide getMessage()/getLineNumber();
            # other bozo exceptions (network errors, encoding overrides)
            # do not, so fall back to generic values for those.
            get_message = getattr(exc, 'getMessage', None)
            get_line = getattr(exc, 'getLineNumber', None)
            if get_message is not None:
                reason = get_message()
            else:
                reason = str(exc)
            if get_line is not None:
                line = get_line()
            else:
                line = '?'
            # The message is already fully formatted; pass it through as-is.
            log_debug(fmt % (feed.url, reason, line))

        # XXX Store etag and last-modified to conserve bandwidth and
        # not get banned from some feeds.  (d.etag is only present when
        # the server sent an ETag header, so it must be read defensively.)
        for entry in d['entries']:
            # TODO: the new entry is not yet added to any container;
            # entry.id is the likely key to store it under.
            create_content(IFeedEntry, entry.title, entry.summary)

def usage(message=None, rc=1):
    """Print the module help text, then an optional message, and exit.

    message -- extra text (for example an option-parsing error) printed
               after the help text, followed by a blank line; omitted
               when None.
    rc      -- exit status handed to ``sys.exit``.

    This function never returns; it always raises ``SystemExit``.
    """
    print(__doc__)
    if message is not None:
        print(message)
        print("")
    sys.exit(rc)

def main(argv=sys.argv):
    """Command-line entry point: parse options, load the app, update feeds.

    argv -- full argument vector including the program name at index 0.

    Recognised options are ``-c/--config PATH``, ``-d/--dry-run`` and
    ``-h/-?/--help``.  After ``update`` has run against the application
    root, the transaction is aborted for a dry run and committed
    otherwise.  Invalid options or a help request end the process via
    ``usage``.
    """
    name, argv = argv[0], argv[1:]

    try:
        opts, args = getopt.getopt(
            argv, 'c:dh?', ['config=', 'dry-run', 'help'])
    except getopt.GetoptError as e:
        usage(e)

    config = None
    dry_run = False

    for k, v in opts:
        if k in ('-c', '--config'):
            config = v
        elif k in ('-d', '--dry-run'):
            dry_run = True
        elif k in ('-h', '-?', '--help'):
            usage(rc=2)

    if config is None:
        # No --config given: look for FeedsTool.ini three directory levels
        # above this script, which is the project root when the script
        # lives at feedstool/scripts/update_feeds.py.
        me = os.path.abspath(name)
        sandbox = os.path.dirname(os.path.dirname(os.path.dirname(me)))
        config = os.path.join(sandbox, 'FeedsTool.ini')

    config = os.path.abspath(os.path.normpath(config))

    app = loadapp('config:%s' % config, name='zodb')
    environ = {}
    registry_manager.set(app.registry)
    root = app.root_policy(environ)
    update(root)
    if dry_run:
        transaction.abort()
    else:
        transaction.commit()

# Run the updater only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
  • Normally we would edit setup.py to add a console-script entry point, which buildout would use to generate a script in bin. For now, we’ll just run the file directly from the command line.
  • Feed parsing
    • easy_install FeedParser
    • command-line script to handle feeds
    • Use catalog to display feedentry content in folder (interface, sorting, batching)
  • Delete Data.fs, start the server, and add a Feed pointing at the URL http://feedparser.org/docs/examples/atom10.xml
  • Make sure you are in a shell where you have already run source bin/activate, and then run:
$ python feedstool/scripts/update_feeds.py

Table Of Contents

Previous topic

Cataloging

Next topic

KARL Forms

This Page