This crawler has the following features:
- It can crawl a site with hundreds of millions of pages, and it supports running multiple instances, pausing, and resuming
- Crawl paths and the crawl delay are configurable
- It automatically discovers new links and adds them to the queue
The code is below.
Crawler configuration file, cnblogs_com.py. It configures the start pages, the URL patterns to crawl, the URL patterns to save, the request headers, the delay, and the timeout.
# coding=utf-8
start_urls = [
    'http://www.cnblogs.com/',
    'http://news.cnblogs.com/',
    'http://q.cnblogs.com/',
    'http://home.cnblogs.com/blog/all/',
]
find_urls = [
    r'^http://news\.cnblogs\.com/n/\d+/$',
    r'^http://q\.cnblogs\.com/q/\d+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/p/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/archive/\d+/\d+/\d+/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/default\.html\?page=\d+$',
    r'^http://q\.cnblogs\.com/tag/',
]
save_urls = [
    r'^http://news\.cnblogs\.com/n/\d+/$',
    r'^http://q\.cnblogs\.com/q/\d+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/p/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/archive/\d+/\d+/\d+/\d+\.html$',
]
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
    "Referer": "http://www.hisearch.cn/",
}
delay = 2    # seconds to sleep between requests
timeout = 5  # per-request timeout in seconds
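The split between find_urls and save_urls is what lets the crawler walk through blog indexes and tag pages without storing them. A minimal sketch to check which bucket a URL falls into, assuming this file lives at conf/sites/cnblogs_com.py (the same import path the Spider class below uses):
# coding=utf-8
# Illustrative check of the patterns above; not part of the crawler itself.
import re
from conf.sites import cnblogs_com as config

samples = [
    'http://news.cnblogs.com/n/123456/',  # article page: followed and saved
    'http://www.cnblogs.com/someone/',    # blog index: followed but not saved
    'http://example.com/other/',          # unrelated: ignored
]
for url in samples:
    follow = any(re.match(p, url) for p in config.find_urls)
    save = any(re.match(p, url) for p in config.save_urls)
    print('%s -> follow: %s, save: %s' % (url, follow, save))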
Crawler entry script. Run python spider.py -s cnblogs_com start to begin crawling a site. Several instances can be started at once, and if a process dies midway, the next run continues from where it left off.
#!/usr/bin/python
# coding=utf-8
import argparse
from lib.Spider import Spider

allow_commands = ['start', 'clean', 'restart']

if __name__ == '__main__':
    # parse command-line arguments
    parser = argparse.ArgumentParser(description='General crawler')
    parser.add_argument('-s', '--site', help='site config file name', required=True)
    parser.add_argument('command', help='|'.join(allow_commands), type=str)
    args = parser.parse_args()
    command = args.command
    # run the requested command
    s = Spider(args.site)
    if command == 'start':
        s.start()
    elif command == 'clean':
        s.clean()
    elif command == 'restart':
        s.restart()
    else:
        print("'%s' is not a valid command, allowed: %s" % (command, '|'.join(allow_commands)))
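Because the queue and the set of seen URLs live in Redis rather than inside the process, "running multiple instances" is just a matter of starting the same command more than once; each worker pops its own URLs from the shared list, and whatever a crashed worker left behind stays queued for the next run. A rough sketch, assuming spider.py is in the current directory and Redis is reachable:
# Launch two workers against the same site config (illustrative sketch).
import subprocess

workers = [subprocess.Popen(['python', 'spider.py', '-s', 'cnblogs_com', 'start'])
           for _ in range(2)]
for w in workers:
    w.wait()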
Spider class, Spider.py. It uses LevelDB to store the crawled pages (LevelDB compresses the data it stores) and Redis as the queue; Redis's HyperLogLog data structure deduplicates URLs with very little memory (a HyperLogLog key tops out at roughly 12 KB no matter how many URLs are added).
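The dedup trick is that pfadd doubles as a membership test: it returns 1 when the URL changed the HyperLogLog (i.e. it is new) and 0 when it has almost certainly been added before. A minimal sketch, assuming a local Redis on the default port:
# Sketch of HyperLogLog-based URL dedup (assumes a local Redis instance).
import redis

r = redis.Redis()
print(r.pfadd('demo_all', 'http://www.cnblogs.com/'))  # 1 -> new URL, would be queued
print(r.pfadd('demo_all', 'http://www.cnblogs.com/'))  # 0 -> already seen, skipped
print(r.pfcount('demo_all'))                           # approximate count of distinct URLs
Since HyperLogLog is probabilistic, a small fraction of genuinely new URLs can be reported as already seen; the crawler trades that for the memory savings. The full Spider class: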
# coding=utf-8
"""
Crawler class
"""
import time
import sys
import traceback
import logging
from logging.handlers import TimedRotatingFileHandler
import re
import uuid

import redis
import requests
import leveldb
from lxml.html import fromstring

from conf import settings


class Spider(object):
    site = None
    config = None
    que = None      # Redis connection: work queue + seen-URL HyperLogLog
    log = None
    db = None       # LevelDB store for downloaded pages
    request = None

    def __init__(self, site):
        self.site = site
        self.load_config()
        self.que = redis.from_url(settings.REDIS_URI)
        self.log = self.get_logger()
        self.db = leveldb.LevelDB(settings.DATA_DIR + self.site, max_open_files=30)
        self.request = requests.session()

    def start(self):
        # Seed the queue only on a fresh start; a crawl that was interrupted
        # (or is already running in another process) just resumes from the
        # URLs left in Redis.
        if not self.is_started():
            self.que.sadd('running_sites', self.site)
            for url in self.config.start_urls:
                self.que.pfadd(self.site + '_all', url)
                self.que.lpush(self.site + '_in', url)
        self.run()

    def run(self):
        while not self.que_is_empty():
            url = self.que.rpop(self.site + '_in')
            if url is None:
                # another worker drained the queue between the check and the pop
                break
            url = url.decode()
            html = self.get_page(url)
            if html is not None:
                data = self.get_data(html, url)
                if data:
                    self.store_data(url, data)
                self.find_more_links(html, url)
            time.sleep(self.config.delay)
        self.finish()

    def que_is_empty(self):
        return self.que.llen(self.site + '_in') == 0

    def load_config(self):
        # Load conf/sites/<site>.py as the per-site configuration module.
        self.config = __import__('conf.sites.' + self.site, fromlist=['conf.sites.' + self.site])

    def is_started(self):
        if self.que.sismember('running_sites', self.site):
            self.log.info("%s is already running." % self.site)
            return True
        else:
            self.log.info("%s has not been started yet." % self.site)
            return False

    def get_page(self, url):
        html = None
        try:
            r = self.request.get(url, headers=self.config.headers, timeout=self.config.timeout)
            if r.ok:
                html = r.text
            r.close()
            self.log.debug("page_download: " + url)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.log.exception("download_error: " + url + ", " + str(exc_value),
                               exc_info=traceback.format_tb(exc_traceback))
        return html

    def get_data(self, html, url):
        # Return the page body only if the URL matches one of save_urls.
        for regxp in self.config.save_urls:
            if re.compile(regxp).match(url):
                return html
        return False

    def store_data(self, url, data):
        self.db.Put(url.encode(), data.encode())
        self.log.debug("page_saved: %s" % url)

    def find_more_links(self, html, url):
        try:
            page = fromstring(html, url)
            page.make_links_absolute(url)
            for element, attribute, link, pos in page.iterlinks():
                for regxp in self.config.find_urls:
                    if re.compile(regxp).match(link):
                        self.add_url(link)
                        break
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.log.exception("find_more_links_error: " + url + ", " + str(exc_value),
                               exc_info=traceback.format_tb(exc_traceback))

    def add_url(self, url):
        # pfadd returns 1 only for URLs the HyperLogLog has not seen before.
        if self.que.pfadd(self.site + '_all', url) == 1:
            key = url.encode()
            # Skip pages already stored in LevelDB by a previous run.
            if key not in self.db.RangeIter(include_value=False, key_from=key, key_to=key):
                self.que.lpush(self.site + '_in', url)
                self.log.debug("page_found: " + url)
        else:
            self.log.debug("page_exist: " + url)

    def finish(self):
        self.que.srem('running_sites', self.site)
        self.que.delete(self.site + '_all')
        self.log.info('finished')

    def clean(self):
        self.que.srem('running_sites', self.site)
        self.que.delete(self.site + '_all')
        self.que.delete(self.site + '_in')
        self.log.info('cleaned')

    def restart(self):
        self.clean()
        self.start()

    def get_logger(self):
        logger = logging.getLogger('spider.' + self.site)
        hd = TimedRotatingFileHandler(settings.LOG_DIR + self.site + '.log', when='D', backupCount=30)
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        logger.addHandler(hd)
        logger.setLevel(logging.DEBUG)
        return logger

    def get_doc_id_by_url(self, url):
        return str(uuid.uuid5(uuid.NAMESPACE_URL, url))
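The pages end up keyed by URL in LevelDB, so a downstream consumer (an indexer, an extractor, and so on) can stream them back out without touching Redis. A rough sketch using the same py-leveldb API the class relies on; the 'data/cnblogs_com' path stands in for settings.DATA_DIR + site:
# coding=utf-8
# Sketch: iterate the pages stored by the crawler (adjust the path to your DATA_DIR).
import uuid
import leveldb

db = leveldb.LevelDB('data/cnblogs_com')
for key, value in db.RangeIter():                       # yields (url, html) pairs as bytes
    url = key.decode()
    html = value.decode()
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))   # same id scheme as get_doc_id_by_url
    print(doc_id, url, len(html))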