How to implement a highly flexible crawler in Python

This crawler has the following features:

  • It can crawl a site with over a hundred million pages, and supports running multiple instances, pausing, and resuming
  • The crawl paths and the delay between requests are configurable
  • It automatically discovers new links and adds them to the queue

The code follows:

The crawler config file, cnblogs_com.py, defines the start pages, which pages to crawl, which pages to save, the request headers, the delay between requests, and the request timeout.

# coding=utf-8

# Seed URLs pushed onto the queue when a crawl of this site starts
start_urls = [
    'http://www.cnblogs.com/',
    'http://news.cnblogs.com/',
    'http://q.cnblogs.com/',
    'http://home.cnblogs.com/blog/all/',
]
# Links matching any of these patterns are followed and queued
find_urls = [
    r'^http://news\.cnblogs\.com/n/\d+/$',
    r'^http://q\.cnblogs\.com/q/\d+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/p/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/archive/\d+/\d+/\d+/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/default\.html\?page=\d+$',
    r'^http://q\.cnblogs\.com/tag/',
]
# Pages whose URL matches any of these patterns are stored in LevelDB
save_urls = [
    r'^http://news\.cnblogs\.com/n/\d+/$',
    r'^http://q\.cnblogs\.com/q/\d+/$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/p/\d+\.html$',
    r'^http://www\.cnblogs\.com/[a-zA-Z0-9\-_]+/archive/\d+/\d+/\d+/\d+\.html$',
]
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
    "Referer": "http://www.hisearch.cn/",
}
delay = 2    # seconds to sleep between requests
timeout = 5  # per-request timeout in seconds
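
How this file is picked up: the Spider class shown later imports it dynamically as conf.sites.<site name>, so the config is assumed to live at conf/sites/cnblogs_com.py. Roughly, the loading step looks like this (a sketch mirroring Spider.load_config below):

# Sketch of how the config above is loaded; mirrors Spider.load_config, and the
# dotted path implies the file sits at conf/sites/cnblogs_com.py.
site = 'cnblogs_com'
config = __import__('conf.sites.' + site, fromlist=['conf.sites.' + site])
# config.start_urls, config.find_urls, config.save_urls, config.headers,
# config.delay and config.timeout are then available to the spider.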

The crawler's command-line entry point. Run python spider.py -s cnblogs_com start to start crawling a site; several instances can be run in parallel, and if a process dies halfway, the next run resumes the crawl from where it stopped.

#!/usr/bin/python
# coding=utf-8


import argparse
from lib.Spider import Spider

allow_commands = ['start', 'clean', 'restart']

if __name__ == '__main__':
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='General crawler')
    parser.add_argument('-s', '--site', help='site config file name', required=True)
    parser.add_argument('command', help='|'.join(allow_commands), type=str)
    args = parser.parse_args()
    command = args.command
    # Dispatch the requested command
    s = Spider(args.site)
    if command == 'start':
        s.start()
    elif command == 'clean':
        s.clean()
    elif command == 'restart':
        s.restart()
    else:
        print('%s is not a valid command, allowed: %s' % (command, '|'.join(allow_commands)))
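
With 'restart' included in allow_commands, the three supported invocations for this site are:

python spider.py -s cnblogs_com start     # start a new crawl, or resume one whose process died
python spider.py -s cnblogs_com clean     # drop the site's queue, its dedup key and its entry in running_sites
python spider.py -s cnblogs_com restart   # clean, then seed the queue from start_urls again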

The spider class, Spider.py. It uses LevelDB to store the crawled pages (LevelDB compresses the data on disk) and Redis as the queue; Redis's HyperLogLog structure deduplicates URLs with very little memory. Two short sketches (the dedup trick in isolation, and an assumed conf/settings.py) follow the class.

# coding=utf-8
"""
Spider class
"""

import time
import sys
import traceback
import logging
from logging.handlers import TimedRotatingFileHandler
import re
import redis
import uuid
import requests
from conf import settings
import leveldb
from lxml.html import fromstring


class Spider(object):
    site = None
    config = None
    que = None
    log = None
    db = None
    request = None

    def __init__(self, site):
        self.site = site
        self.load_config()
        self.que = redis.from_url(settings.REDIS_URI)
        self.log = self.get_logger()
        self.db = leveldb.LevelDB(settings.DATA_DIR + self.site, max_open_files=30)
        self.request = requests.session()

    def start(self):
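        # Redis keys used per site (as seen in the calls below):
        #   running_sites - set of sites that currently have a crawl in progress
        #   <site>_all    - HyperLogLog (pfadd) of every URL ever queued, for cheap dedup
        #   <site>_in     - list used as the pending-URL queue (lpush to add, rpop to take)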
        if not self.is_started():
            self.que.sadd('running_sites', self.site)
            for url in self.config.start_urls:
                self.que.pfadd(self.site + '_all', url)
                self.que.lpush(self.site + '_in', url)
        self.run()

    def run(self):
        while not self.que_is_empty():
            raw = self.que.rpop(self.site + '_in')
            if raw is None:
                # another instance may have emptied the queue between the llen check and the rpop
                break
            url = raw.decode()
            html = self.get_page(url)
            if html is not None:
                data = self.get_data(html, url)
                if data:
                    self.store_data(url, data)
                self.find_more_links(html, url)
            time.sleep(self.config.delay)
        self.finish()

    def que_is_empty(self):
        return self.que.llen(self.site + '_in') == 0

    def load_config(self):
        self.config = __import__('conf.sites.' + self.site, fromlist=['conf.sites.' + self.site])

    def is_started(self):
        if self.que.sismember('running_sites', self.site):
            self.log.info("%s has already been started." % (self.site))
            return True
        else:
            self.log.info("%s has not been started." % (self.site))
            return False

    def get_page(self, url):
        html = None
        try:
            r = self.request.get(url, headers=self.config.headers, timeout=self.config.timeout)
            if r.ok:
                html = r.text
            r.close()
            self.log.debug("page_download: " + url)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.log.exception("download_error: " + url + ", " + str(exc_value),
                               exc_info=traceback.format_tb(exc_traceback))
        return html

    def get_data(self, html, url):
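        # The page body is kept only when the URL matches one of the save_urls patterns;
        # otherwise False is returned and only the links on the page are used.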
        for regxp in self.config.save_urls:
            if re.compile(regxp).match(url):
                return html
        return False

    def store_data(self, url, data):
        self.db.Put(url.encode(), data.encode())
        self.log.debug("page_saved: %s" % url)

    def find_more_links(self, html, url):
        try:
            page = fromstring(html, url)
            page.make_links_absolute(url)
            for element, attribute, link, pos in page.iterlinks():
                for regxp in self.config.find_urls:
                    if re.compile(regxp).match(link):
                        self.add_url(link)
                        break
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.log.exception("find_more_links_error: " + url + ", " + str(exc_value),
                               exc_info=traceback.format_tb(exc_traceback))

    def add_url(self, url):
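        # pfadd returns 1 only for URLs the HyperLogLog has (probably) not seen yet, so one
        # call both records and tests the URL; the LevelDB lookup then skips pages that were
        # already stored by a previous run (the _all key is deleted in finish()/clean()).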
        if self.que.pfadd(self.site + '_all', url) == 1:
            key = url.encode()
            if key not in self.db.RangeIter(include_value=False, key_from=key, key_to=key):
                self.que.lpush(self.site + '_in', url)
                self.log.debug("page_found: " + url)
            else:
                self.log.debug("page_exist: " + url)

    def finish(self):
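        # Runs once the queue drains: the site leaves running_sites and the dedup key is
        # dropped, while the pages already stored in LevelDB are kept.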
        self.que.srem('running_sites', self.site)
        self.que.delete(self.site + '_all')
        self.log.info('finished')

    def clean(self):
        self.que.srem('running_sites', self.site)
        self.que.delete(self.site + '_all')
        self.que.delete(self.site + '_in')
        self.log.info('cleaned')

    def restart(self):
        self.clean()
        self.start()

    def get_logger(self):
        logger = logging.getLogger('spider.' + self.site)
        hd = TimedRotatingFileHandler(settings.LOG_DIR + self.site + '.log', when='D', backupCount=30)
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        logger.addHandler(hd)
        logger.setLevel(logging.DEBUG)
        return logger

    def get_doc_id_by_url(self, url):
        return str(uuid.uuid5(uuid.NAMESPACE_URL, url))
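
The HyperLogLog dedup used in add_url, shown in isolation: pfadd returns 1 when the element was (probably) not in the key yet and 0 for repeats, so a single call both records and tests a URL, and the key stays small no matter how many URLs are added. A standalone sketch (the Redis URL and key name are examples only):

# Minimal sketch of the HyperLogLog dedup; the Redis URL and key name are examples.
import redis

que = redis.from_url('redis://127.0.0.1:6379/0')
print(que.pfadd('cnblogs_com_all', 'http://www.cnblogs.com/'))  # 1 -> first time seen
print(que.pfadd('cnblogs_com_all', 'http://www.cnblogs.com/'))  # 0 -> (probably) seen before

Spider.py also imports a settings module from conf that is not shown in this post. Below is a minimal sketch of what it needs to define; the attribute names come from the code above, while the values are placeholders to adjust for your environment:

# coding=utf-8
# conf/settings.py (sketch; only the attribute names are taken from Spider.py)

REDIS_URI = 'redis://127.0.0.1:6379/0'  # Redis instance used for the queue and the dedup key
DATA_DIR = './data/'                    # one LevelDB database per site is created here (note the trailing slash)
LOG_DIR = './logs/'                     # one rotating log file per site is written here (note the trailing slash)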