标签归档:iterparse

使用 ElementTree 处理大型 XML 文件

这个 XML 文件超过了 30 GB,无法一次性载入内存。关键在于用 iterparse 进行增量解析,并在处理完每个元素后调用 e.clear() 释放其占用的内存。

from elementtree.ElementTree import iterparse
from datetime import datetime
import redis
import json
import lxml.html
import re
import traceback
import time
import cgi


def main():
    """Stream questions from a Posts.xml dump into a Redis list.

    The file is parsed incrementally with ``iterparse``. Each ``row``
    element whose ``PostTypeId`` is ``"1"`` is converted to a JSON document
    (url, title, plain-text content, comma-separated tags, site, timestamp)
    and pushed onto the ``ResultQueue`` list. Every element is cleared once
    it has been handled so that its attributes and text can be reclaimed.

    Rows that fail to convert or push are reported on stdout/stderr and
    skipped; they do not abort the run.
    """
    redisConn = redis.from_url("redis://localhost:6379/0")
    i = 0
    xmlfile = "/data/download/Posts.xml"
    for event, e in iterparse(xmlfile):
        if e.tag == "row" and e.get("PostTypeId") == "1":
            try:
                data = {
                    "url": "http://stackoverflow.com/questions/" + e.get("Id"),
                    "title": cgi.escape(e.get("Title")),
                    # Strip the HTML markup from the body, then re-escape the
                    # remaining plain text.
                    "content": cgi.escape(
                        lxml.html.fromstring(e.get("Body")).text_content()
                    ),
                    # Tags are stored as "<a><b>"; flatten them to "a,b".
                    "tags": cgi.escape(
                        ",".join(re.findall(r"<([^>]+)>", e.get("Tags")))
                    ),
                    'site': 'stackoverflow',
                    "timestamp": datetime.now().isoformat(),
                }
                redisConn.lpush('ResultQueue', json.dumps(data))
            except Exception:
                # Best effort: report the bad row and fall through, so the
                # element is still counted and cleared below. Catching
                # Exception (not everything) keeps Ctrl-C working.
                traceback.print_exc()
                print(e.attrib)
        i += 1
        if i % 1000 == 0:
            # Pause after every 1000 parsed elements before reporting
            # progress.
            time.sleep(5)
            print(i)
        # Must run for every element, including rows that failed above;
        # otherwise their data stays in memory for the rest of the parse.
        # NOTE(review): the emptied elements presumably remain attached to
        # the document root; if memory still grows, clear the root as well.
        e.clear()


# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()