标签归档:Python

使用Splash解决爬取页面时需要执行JS问题

写 Scrapy 爬虫时,遇到了 js 进行跳转的页面,大家有没有好的解决方法?

答案是:

splash

Splash is a javascript rendering service with an HTTP API. It’s a lightweight browser with an HTTP API, implemented in Python 3 using Twisted and QT5.

It’s fast, lightweight and state-less which makes it easy to distribute.

Documentation is available here: https://splash.readthedocs.io/

scrapy-splash

This library provides Scrapy and JavaScript integration using Splash. The license is BSD 3-clause.

参考:

HTML转为PDF的两种方案(含nodejs、PHP以及Python三种实现代码)

采用chrome headless方案

为什么要采用Chrome headless

因为wkhtmltopdf内置的为qt的webkit,已经很久不更新了,很多css3以及html5都支持不友好。

Chrome官方提供的页面转换为PDF的接口

https://chromedevtools.github.io/devtools-protocol/tot/Page#method-printToPDF

命令行方式

chrome --headless --print-to-pdf=path/to/file.pdf https://example.com

参考:HTML to PDF conversion using Chrome pdfium?

NodeJS扩展

html-pdf-chrome HTML to PDF converter via Chrome/Chromium.

PHP扩展

chrome-html-to-pdf Converts HTML to PDF using Google Chrome

Chrome命令行参数列表

List of Chromium Command Line Switches

采用Qt的Webkit(PyQt5)

由于当前的chrome转换存在BUG,转换大文件时内存消耗特别大,生成的文件也比较大,测试了10多种方法后,最后决定采用PyQt5来做

#!/usr/bin/env python3

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


class PrinterView(QWebEngineView):
    """Web view that loads one URL and then previews it or exports it to PDF.

    Args:
        url: Address of the page to load; wrapped in a ``QUrl``.
        filename: Path of the PDF file to write when not previewing.
        do_preview: If true, show the loaded page in a window instead of
            exporting it.
        parent: Optional parent widget.
    """

    def __init__(self, url, filename, do_preview, parent=None):
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.filename = filename
        # Connect both signals exactly once, before the load is started.
        # If loadFinished fires more than once, connecting
        # pdfPrintingFinished inside the load handler would add a duplicate
        # connection (and a duplicate callback) every time.
        self.page().pdfPrintingFinished.connect(on_pdf_finished)
        self.loadFinished.connect(self.load_finished)
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)

    def load_finished(self, ok=True):
        """Handle the end of a page load.

        Args:
            ok: Success flag delivered by the ``loadFinished`` signal.  It
                defaults to ``True`` so the method can still be called
                without arguments.
        """
        if not ok:
            # Best effort: report the problem but still show/export whatever
            # was rendered, as the previous implementation did.
            print("warning: page did not load successfully: "
                  + self.url().toString(), file=sys.stderr)

        if self.do_preview:
            self.show()
            return

        # A5 portrait with zero margins on every side.
        page_layout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                  QMarginsF(0, 0, 0, 0))
        self.page().printToPdf(self.filename, page_layout)


def on_pdf_finished(result, success=True):
    """Quit the application once the PDF export has finished.

    When connected to ``pdfPrintingFinished`` the signal delivers the output
    file path followed by a success flag.  Accepting only the first value
    hides failures, because the path is non-empty even when the export
    failed.

    Args:
        result: Output file path (or any truthy value) describing the
            finished export.
        success: Success flag supplied by the signal.  Defaults to ``True``
            so callers that pass a single value keep working.

    The process exit code is 0 on success and 1 on failure.
    """
    if result and success:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    # The Qt application object has to exist before any widget is created.
    app = QApplication(sys.argv)

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--url", "-i",
        required=True,
        help="Input URL (http://example.com, file:///home/user/example.html, ...)",
    )
    cli.add_argument("--output", "-o", required=True, help="Write pdf to this file")
    cli.add_argument("--preview", "-p", action="store_true", help="Open preview")
    args = cli.parse_args()

    # Keep a reference to the view so it stays alive while the event loop runs.
    view = PrinterView(args.url, args.output, args.preview)
    exit_code = app.exec_()
    sys.exit(exit_code)

 采用qt打印

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineProfile
from PyQt5.QtWidgets import QApplication
from PyQt5.QtPrintSupport import QPrinter, QPrintDialog


class PrinterView(QWebEngineView):
    """Web view that loads one URL and prints it to a PDF through QPrinter.

    ``load_finished2`` is the handler connected to ``loadFinished``; it
    always shows the window and prints with ``self.printer``.
    ``load_finished`` (the ``printToPdf`` variant, the only place that reads
    ``do_preview``) is kept for callers but is not connected here.

    Args:
        url: Address of the page to load; wrapped in a ``QUrl``.
        filename: Path of the PDF file to write.
        do_preview: Preview flag consulted by ``load_finished`` only.
        parent: Optional parent widget.
    """

    # QWebEngineProfile.setHttpCacheMaximumSize() takes a C++ ``int``, so
    # the 5 GiB value used previously does not fit.  Request the largest
    # size the parameter can carry instead.
    MAX_HTTP_CACHE_BYTES = 2 ** 31 - 1

    def __init__(self, url, filename, do_preview, parent=None):
        # Initialise the Qt base class before touching the instance.
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.filename = filename

        # Stored on the instance so the printer outlives the asynchronous
        # print started in load_finished2.
        self.printer = QPrinter()
        self.printer.setPageSize(QPrinter.A5)
        self.printer.setOrientation(QPrinter.Portrait)
        self.printer.setOutputFormat(QPrinter.PdfFormat)
        self.printer.setOutputFileName(filename)
        self.printer.setPageMargins(0, 0, 0, 0, QPrinter.Millimeter)

        profile = self.page().profile()
        profile.setHttpCacheMaximumSize(self.MAX_HTTP_CACHE_BYTES)
        profile.setHttpCacheType(QWebEngineProfile.MemoryHttpCache)

        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished2)

    def load_finished(self):
        """Preview the page, or export it with ``printToPdf`` (A5, no margins)."""
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            # Connect before starting the export so the completion signal
            # cannot be missed.
            self.page().pdfPrintingFinished.connect(on_pdf_finished)
            self.page().printToPdf(self.filename, pageLayout)

    def load_finished2(self):
        """Show the page and print it with the configured ``QPrinter``.

        ``on_pdf_finished`` is passed as the completion callback.
        """
        self.show()
        self.page().print(self.printer, on_pdf_finished)


def on_pdf_finished(result, success=True):
    """Quit the application once printing has finished.

    The function is used in two ways in this script: as the completion
    callback of ``page().print(...)``, which passes a single value, and as a
    slot for ``pdfPrintingFinished``, which delivers the output file path
    followed by a success flag.  With only one parameter the slot would see
    just the non-empty path and report success even when the export failed.

    Args:
        result: Single completion value, or the output file path when called
            by ``pdfPrintingFinished``.
        success: Success flag supplied by ``pdfPrintingFinished``.  Defaults
            to ``True`` so single-argument callers keep working.

    The process exit code is 0 on success and 1 on failure.
    """
    if result and success:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    # The Qt application object has to exist before any widget is created.
    app = QApplication(sys.argv)

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--url", "-i",
        required=True,
        help="Input URL (http://example.com, file:///home/user/example.html, ...)",
    )
    cli.add_argument("--output", "-o", required=True, help="Write pdf to this file")
    cli.add_argument("--preview", "-p", action="store_true", help="Open preview")
    args = cli.parse_args()

    # Keep a reference to the view so it stays alive while the event loop runs.
    view = PrinterView(args.url, args.output, args.preview)
    exit_code = app.exec_()
    sys.exit(exit_code)

使用firefox的pdf

slimer-html-pdf – convert any HTML document to PDF format using slimerjs (Gecko)

大文件合并(Python)

 def on_pdf_finished(self, result):
        """Record one finished export, then start the next one or merge all.

        While fewer than ``self.total`` volumes have been exported, the next
        export is started with ``self.print_book()``.  After the last one,
        the per-volume files ``<filename>.<index>.pdf`` are merged into
        ``self.filename`` and the application exits.
        """
        if result:
            print(result + ', total ' + str(self.total))
        else:
            print("导出失败")
        self.printed += 1
        print('导出第', self.printed, '本')

        # More volumes pending: kick off the next export and wait for it.
        if self.printed < self.total:
            self.print_book()
            return

        print('开始合并')
        merger = PdfFileMerger()
        for volume in range(self.total):
            merger.append(self.filename + '.' + str(volume) + '.pdf')
            print('合并第', volume, '本')
        merger.write(self.filename)
        merger.close()
        print('合并完成,开始清除临时文件')
        # NOTE: deleting the per-volume files is currently disabled, so the
        # temporary PDFs are left on disk; only the messages are printed.
        print('清除临时文件完成')
        QApplication.exit()

python+selenium自动登录淘宝网并获取订单数据

本文将介绍如何使用python3 + selenium自动登录淘宝,并获取订单信息。

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.action_chains import ActionChains

import unittest, time, re


class Login(unittest.TestCase):
    """Log in to Taobao with Firefox and print an order-detail page.

    The account name, password and order id in ``test_login`` are
    placeholders and must be replaced before the test can succeed.
    """

    # Upper bound, in seconds, on how long to wait for the post-login page.
    # NOTE(review): generous on the assumption that a slider check may need
    # to be completed by hand -- adjust as needed.
    LOGIN_TIMEOUT = 120

    def setUp(self):
        """Start Firefox and initialise the per-test bookkeeping."""
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        self.base_url = "https://www.taobao.com/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_login(self):
        """Sign in with account/password, then open and print an order page."""
        driver = self.driver
        # Open the login page.
        driver.get("https://login.taobao.com/member/login.jhtml?&redirectURL=http%3A%2F%2Fwww.taobao.com%2F")
        # implicitly_wait() only changes the element-lookup timeout; it does
        # not pause the script.
        driver.implicitly_wait(5)
        # Switch to the account/password login form.
        driver.find_element(By.ID, 'J_Quick2Static').click()
        driver.implicitly_wait(2)
        # Fill in the (placeholder) credentials.
        username_box = driver.find_element(By.ID, "TPL_username_1")
        username_box.clear()
        username_box.send_keys("username")
        password_box = driver.find_element(By.ID, "TPL_password_1")
        password_box.clear()
        password_box.send_keys("passwd")
        driver.implicitly_wait(5)
        # A slider check may be shown here; it is not handled automatically.
        # Submit the form.
        driver.find_element(By.ID, "J_SubmitStatic").click()
        self._wait_until_logged_in()
        # Open the order-detail page (placeholder order id).
        driver.execute_script('window.location="https://trade.taobao.com/trade/detail/trade_order_detail.htm?biz_order_id=xxxxxx"')
        driver.implicitly_wait(5)
        driver.find_element(By.LINK_TEXT, "收货和物流信息").click()
        print(driver.page_source)

    def _wait_until_logged_in(self):
        """Poll until the "卖家中心" link is displayed or the timeout expires.

        A missing link is treated as "not logged in yet" rather than as an
        error, and the loop sleeps between polls instead of spinning.
        """
        deadline = time.time() + self.LOGIN_TIMEOUT
        while time.time() < deadline:
            try:
                if self.driver.find_element(By.LINK_TEXT, "卖家中心").is_displayed():
                    return
            except NoSuchElementException:
                pass
            time.sleep(1)
        self.fail("login did not complete within %d seconds" % self.LOGIN_TIMEOUT)

    def tearDown(self):
        """Close the browser and check that no verification errors were recorded."""
        # Quit the browser so no Firefox/driver process is left behind.
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)


# Run the test case through unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

fedora27+python3+virtualenv+virtualenvwrapper安装及使用

安装virtualenvwrapper

这里使用pip3进行安装即可

sudo pip3 install virtualenvwrapper

设置virtualenvwrapper的运行环境变量

编辑~/.bashrc,加入以下几行

# Interpreter that virtualenvwrapper itself should run under.
export VIRTUALENVWRAPPER_PYTHON=/usr/bin/python3
# Use $HOME: a tilde inside quotes is not expanded by the shell.
export WORKON_HOME="$HOME/.virtualenvs"
source /usr/local/bin/virtualenvwrapper.sh

使环境变量马上生效:

source ~/.bashrc

virtualenvwrapper使用

mkvirtualenv 新建虚拟环境

mkvirtualenv env1

建立后可以看到终端会以env1开头

(env1) [ning@localhost]$

再建立一个env2

mkvirtualenv env2

workon 启动/切换虚拟环境

workon env1

deactivate 离开虚拟环境

deactivate

rmvirtualenv 删除虚拟环境

rmvirtualenv env2

virtualenvwrapper help 查看virtualenvwrapper使用帮助

virtualenvwrapper is a set of extensions to Ian Bicking's virtualenv
tool.  The extensions include wrappers for creating and deleting
virtual environments and otherwise managing your development workflow,
making it easier to work on more than one project at a time without
introducing conflicts in their dependencies.

For more information please refer to the documentation:

    http://virtualenvwrapper.readthedocs.org/en/latest/command_ref.html

Commands available:

  add2virtualenv: add directory to the import path

  allvirtualenv: run a command in all virtualenvs

  cdproject: change directory to the active project

  cdsitepackages: change to the site-packages directory

  cdvirtualenv: change to the $VIRTUAL_ENV directory

  cpvirtualenv: duplicate the named virtualenv to make a new one

  lssitepackages: list contents of the site-packages directory

  lsvirtualenv: list virtualenvs

  mkproject: create a new project directory and its associated virtualenv

  mktmpenv: create a temporary virtualenv

  mkvirtualenv: Create a new virtualenv in $WORKON_HOME

  rmvirtualenv: Remove a virtualenv

  setvirtualenvproject: associate a project directory with a virtualenv

  showvirtualenv: show details of a single virtualenv

  toggleglobalsitepackages: turn access to global site-packages on/off

  virtualenvwrapper: show this help message

  wipeenv: remove all packages installed in the current virtualenv

  workon: list or change working virtualenvs

virtualenv --help 帮助

Usage: virtualenv [OPTIONS] DEST_DIR

Options:
  --version             show program's version number and exit
  -h, --help            show this help message and exit
  -v, --verbose         Increase verbosity.
  -q, --quiet           Decrease verbosity.
  -p PYTHON_EXE, --python=PYTHON_EXE
                        The Python interpreter to use, e.g.,
                        --python=python2.5 will use the python2.5 interpreter
                        to create the new environment.  The default is the
                        interpreter that virtualenv was installed with
                        (/usr/bin/python3)
  --clear               Clear out the non-root install and start from scratch.
  --no-site-packages    DEPRECATED. Retained only for backward compatibility.
                        Not having access to global site-packages is now the
                        default behavior.
  --system-site-packages
                        Give the virtual environment access to the global
                        site-packages.
  --always-copy         Always copy files rather than symlinking.
  --unzip-setuptools    Unzip Setuptools when installing it.
  --relocatable         Make an EXISTING virtualenv environment relocatable.
                        This fixes up scripts and makes all .pth files
                        relative.
  --no-setuptools       Do not install setuptools in the new virtualenv.
  --no-pip              Do not install pip in the new virtualenv.
  --no-wheel            Do not install wheel in the new virtualenv.
  --extra-search-dir=DIR
                        Directory to look for setuptools/pip distributions in.
                        This option can be used multiple times.
  --download            Download preinstalled packages from PyPI.
  --no-download, --never-download
                        Do not download preinstalled packages from PyPI.
  --prompt=PROMPT       Provides an alternative prompt prefix for this
                        environment.
  --setuptools          DEPRECATED. Retained only for backward compatibility.
                        This option has no effect.
  --distribute          DEPRECATED. Retained only for backward compatibility.
                        This option has no effect.

Windows上安装Django2(Python3+virtualenv+virtualenvwrapper)

本文档将指导您在Windows上安装Python 3.6和Django。它还提供了安装virtualenv和virtualenvwrapper的指导,这使得在Python项目上工作变得更容易。这是为使用Django项目的用户提供的初学者指南,并不反映在为Django本身开发补丁程序时如何安装Django。

本指南中的步骤已经通过Windows 7、8和10进行了测试。在其他版本中,步骤是类似的。您需要熟悉使用Windows命令提示符。

安装Python

Django是一个Python的web框架,因此需要在你的机器上安装Python。在编写的时候,Python 3.6是最新版本。

要在您的机器上安装Python,请访问https://python.org/downloads/。该网站应该为您提供最新的Python版本的下载按钮。下载可执行安装程序并运行它。选中“Add Python 3.6 to PATH”旁边的复选框,然后单击“Install Now”。

安装完成后,打开命令提示符并检查Python版本是否与执行的安装版本相匹配:

python --version

关于 pip

pip是Python的包管理。它使得安装和卸载Python包(如Django!)非常简单。对于安装的其余部分,我们将使用pip从命令行安装Python包。

要在您的机器上安装pip,请转到https://pip.pypa.io/en/latest/installing/,然后按照使用get-pip.py安装说明进行操作。

安装 virtualenv 和 virtualenvwrapper

virtualenv和virtualenvwrapper为您创建的每个Django项目提供了一个专用的环境。虽然不是强制性的,但这被认为是最佳实践,在您准备好部署项目时将为您节省时间。只需输入:

pip install virtualenvwrapper-win

然后为您的项目创建一个虚拟环境:

mkvirtualenv myproject

虚拟环境将被自动激活,您将在命令提示符旁边看到(myproject),以指定它。如果您启动一个新的命令提示符,您将需要再次激活环境

workon myproject

安装 Django

Django可以在您的虚拟环境中使用pip轻松安装。

在命令提示符下,确保您的虚拟环境处于活动状态,然后执行以下命令:

pip install django

这将下载并安装最新的Django版本。

安装完成后,您可以通过在命令提示符下执行django-admin --version来验证您的Django安装。

请参阅“运行您的数据库”一节,了解如何为Django安装和配置数据库。

Wagtail demo 安装指南

学习Wagtail最快的办法可能就是下载最新的Demo先进行体验了。

Demo地址:https://github.com/wagtail/bakerydemo

# Fetch the demo project, then install its Python dependencies.
git clone https://github.com/wagtail/bakerydemo.git
cd bakerydemo
pip install -r requirements.txt

Windows下安装时,可能会在安装uwsgi时提示 module ‘os’ has no attribute ‘uname’。解决办法是修改requirements/production.txt,删除或注释掉其中的uwsgi一行,然后重新运行 pip install -r requirements.txt 即可。

接下来,我们将设置本地环境变量。我们使用django-dotenv来完成这件事:它会读取项目顶层目录中名为.env的文件里的环境变量。我们唯一需要设置的变量是DJANGO_SETTINGS_MODULE:

cp bakerydemo/settings/local.py.example bakerydemo/settings/local.py
echo "DJANGO_SETTINGS_MODULE=bakerydemo.settings.local" > .env

修改bakerydemo/settings/local.py中的数据库配置:

# Django database settings: the default connection uses the PostgreSQL
# backend and the "bakerydemo" database on 127.0.0.1:5432.
# NOTE(review): 'postgres' / 'password' look like placeholder credentials --
# replace them with the values of the local installation.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql',
        'NAME': 'bakerydemo',
        'USER': 'postgres',
        'PASSWORD': 'password',
        'HOST': '127.0.0.1',
        'PORT': '5432',
    }
}

要设置数据库并加载初始数据,请运行以下命令:

./manage.py migrate
./manage.py load_initial_data
./manage.py runserver

使用 admin / changeme 登录到管理后台。

使用elementtree处理大的xml

这个xml超过了30G,关键是用iterparse进行增量解析,并在处理完每个元素后调用e.clear()释放内存

from elementtree.ElementTree import iterparse
from datetime import datetime
import redis
import json
import lxml.html
import re
import traceback
import time
import cgi


def main(xmlfile="/data/download/Posts.xml", redis_url="redis://localhost:6379/0"):
    """Stream rows of a ``Posts.xml`` dump into a Redis list.

    Every ``row`` element whose ``PostTypeId`` attribute is ``"1"`` is turned
    into a JSON document (url, title, content, tags, site, timestamp) and
    pushed onto the ``ResultQueue`` list.  A row that cannot be converted is
    reported (traceback plus its attributes) and skipped; processing then
    continues with the next element.

    Args:
        xmlfile: Path of the XML file to read.
        redis_url: Connection URL handed to ``redis.from_url``.
    """
    redisConn = redis.from_url(redis_url)
    i = 0
    # Ask for "start" events as well: the first event then hands us the root
    # element, which must be emptied regularly.  Clearing only the children
    # leaves one (empty) element per row attached to the root, so memory
    # would still grow with the size of the file.
    context = iter(iterparse(xmlfile, events=("start", "end")))
    event, root = next(context)
    for event, e in context:
        if event != "end":
            # An element is only complete at its "end" event.
            continue
        if e.tag == "row" and e.get("PostTypeId") == "1":
            try:
                # NOTE(review): cgi.escape was removed in Python 3.8;
                # html.escape(s, quote=False) is the replacement if this
                # script is ported.
                data = {
                    "url": "http://stackoverflow.com/questions/" + e.get("Id"),
                    "title": cgi.escape(e.get("Title")),
                    "content": cgi.escape(lxml.html.fromstring(e.get("Body")).text_content()),
                    "tags": cgi.escape(",".join(re.findall("<([^>]+)>", e.get("Tags")))),
                    'site': 'stackoverflow',
                    "timestamp": datetime.now().isoformat()
                }
                redisConn.lpush('ResultQueue', json.dumps(data))
            except Exception:
                # Report the bad row, then fall through so it is still
                # counted and, more importantly, still released below.
                traceback.print_exc()
                print(e.attrib)
        i += 1
        if i % 1000 == 0:
            # Pause after every 1000 elements and report progress.
            # NOTE(review): presumably throttles the queue producer -- confirm.
            time.sleep(5)
            print(i)
        # Release the element and drop the root's references to finished
        # children so the tree never accumulates.
        e.clear()
        root.clear()


# Run the import when this file is executed as a script.
if __name__ == '__main__':
    main()