# This XML file is over 30 GB; the keys to bounded memory are iterparse and e.clear().
# Standard library
# NOTE: the standalone "elementtree" package is long dead; ElementTree has
# lived in the stdlib as xml.etree since Python 2.5.
from datetime import datetime
from xml.etree.ElementTree import iterparse
import cgi  # NOTE(review): cgi.escape was removed in Python 3.8 — use html.escape
import json
import re
import time
import traceback

# Third-party
import lxml.html
import redis
def main():
    """Stream-parse a >30 GB Stack Exchange Posts.xml dump and push each
    question row onto the Redis list ``ResultQueue`` as a JSON document.

    Uses ``iterparse`` plus ``Element.clear()`` on every element so memory
    stays bounded regardless of file size.
    """
    # Local import: stdlib replacement for cgi.escape (removed in Python 3.8).
    from html import escape

    redis_conn = redis.from_url("redis://localhost:6379/0")
    processed = 0
    xmlfile = "/data/download/Posts.xml"
    for _event, elem in iterparse(xmlfile):
        # PostTypeId "1" marks a question row in the Posts.xml schema.
        if elem.tag == "row" and elem.get("PostTypeId") == "1":
            try:
                data = {
                    "url": "http://stackoverflow.com/questions/" + elem.get("Id"),
                    # quote=False matches the old cgi.escape default:
                    # escape only &, <, > (not quotes).
                    "title": escape(elem.get("Title"), quote=False),
                    "content": escape(
                        lxml.html.fromstring(elem.get("Body")).text_content(),
                        quote=False,
                    ),
                    # Tags arrive as "<tag1><tag2>..."; extract the names.
                    "tags": escape(
                        ",".join(re.findall(r"<([^>]+)>", elem.get("Tags"))),
                        quote=False,
                    ),
                    'site': 'stackoverflow',
                    "timestamp": datetime.now().isoformat(),
                }
                redis_conn.lpush('ResultQueue', json.dumps(data))
            except Exception:
                # A single malformed row must not kill a 30 GB run;
                # log it and move on. (Was a bare except:, which would
                # also swallow KeyboardInterrupt/SystemExit.)
                traceback.print_exc()
                print(elem.attrib)
            else:
                processed += 1
                if processed % 1000 == 0:
                    # Brief pause every 1000 questions so Redis/consumers
                    # can keep up; also gives visible progress output.
                    time.sleep(5)
                    print(processed)
        # Clear every element (not only questions), and even after a failed
        # row — the original `continue` skipped clear() and leaked memory.
        elem.clear()
# Entry point: run only when executed as a script, not on import.
if __name__ == '__main__':
    main()