分类目录归档：Python

Python 修改PDF文档尺寸以及去除水印图片

# -*- coding: UTF-8 -*-

import sys
import os

from pdfrw import PageMerge, PdfReader, PdfWriter, IndirectPdfDict
import fitz


# resize
def adjust(page, width=421, height=595):
    """Crop *page* to a ``width`` x ``height`` window and return the new page.

    The window offset is half of the page's surplus over the target size in
    each direction, so the kept area is centred when the page box starts at
    the origin.  The defaults (421 x 595) are roughly an A5 portrait page
    when the page box is measured in PDF points.

    Args:
        page: A pdfrw page, e.g. one item of ``PdfReader(...).pages``.
        width: Width of the area to keep, in the units of the page box.
        height: Height of the area to keep, in the units of the page box.

    Returns:
        The page built by ``PageMerge.render()`` for the cropped view.
    """
    # xobj_box unpacks to (x1, y1, x2, y2); only the far corner is used.
    _, _, x2, y2 = PageMerge().add(page).xobj_box
    viewrect = ((x2 - width) / 2, (y2 - height) / 2, width, height)
    return PageMerge().add(page, viewrect=viewrect).render()

# Expect exactly one command-line argument: the path of the input PDF.
# (The one-element unpacking raises ValueError for any other argument count.)
fin, = sys.argv[1:]
# Intermediate file with the resized pages; the name is relative, so it is
# written to the current working directory.
fout = 'mid.' + os.path.basename(fin)
reader = PdfReader(fin)
writer = PdfWriter(fout)
# Resize every page and append it to the output document.
for p in reader.pages:
    writer.addpage(adjust(p))
# Carry the source document's Info dictionary over (empty if it has none).
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()

# Strip background (watermark) images from the resized file
doc = fitz.open(fout)
for i in range(len(doc)):
    # Images referenced by page i.
    imglist = doc.getPageImageList(i)
    for img in imglist:
        # The first field of each entry is used as the image's xref number.
        xref = img[0]
        # NOTE(review): 51 is hard-coded -- presumably the xref of the
        # watermark image in the author's document.  The entries printed
        # below can be used to find the right value for another file.
        if xref==51:
            doc._deleteObject(xref)
        print(img)
# The final result is saved as 'new.<input file name>'.
doc.save('new.' + os.path.basename(fin))

Python通用logging方法：控制台输出以及文件自动切分

import logging
from logging.handlers import TimedRotatingFileHandler

import sys

def get_logger():
    """Return the root logger configured for DEBUG output on stdout.

    The console handler is attached only once (it is recognised by its
    handler name), so calling this function repeatedly -- for example from
    several modules -- does not make every record appear multiple times.

    Returns:
        logging.Logger: The root logger with its level set to ``DEBUG``.
    """
    handler_name = "get_logger.stdout"
    logger = logging.getLogger()
    if not any(h.get_name() == handler_name for h in logger.handlers):
        hd = logging.StreamHandler(sys.stdout)
        # To write to a file rotated daily (keeping 30 old files) instead:
        # hd = TimedRotatingFileHandler('mylog.log', when='D', backupCount=30)
        # TimedRotatingFileHandler rotates by time only; it has no maxBytes.
        hd.set_name(handler_name)
        hd.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
        logger.addHandler(hd)
    logger.setLevel(logging.DEBUG)
    return logger

django rest framework token验证指南

安装app

修改settings.py增加rest_framework.authtoken

# Applications enabled for the project (settings.py).
INSTALLED_APPS = (
    'rest_framework',
    # Token support; migrating afterwards creates the authtoken_token table.
    'rest_framework.authtoken',
    'myapp',
)

增加权限验证

# Project-wide defaults for Django REST framework (settings.py).
REST_FRAMEWORK = {
    # Permission class applied to views by default.
    'DEFAULT_PERMISSION_CLASSES': (
        'rest_framework.permissions.IsAdminUser',
    ),
    # Authentication classes tried by default: session and token.
    'DEFAULT_AUTHENTICATION_CLASSES': (
        'rest_framework.authentication.SessionAuthentication',
        'rest_framework.authentication.TokenAuthentication',
    )
}

升级表可以看到增加了authtoken_token表

python manage.py migrate

添加url

编辑urls.py

from rest_framework.authtoken import views

# NOTE(review): this snippet assumes path, url, include, admin and router are
# already imported/defined in urls.py; only the authtoken import is shown.
urlpatterns = [
    path('admin/', admin.site.urls),
    url(r'^', include(router.urls)),
    # URLs shipped with rest_framework, under the 'rest_framework' namespace.
    url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
    # POST a username and password here to obtain an auth token.
    url(r'^api-token-auth/', views.obtain_auth_token),
]

获取token

http POST 127.0.0.1:8000/api-token-auth/ username='admin' password='password'

{
    "token": "9d1ff379e5e380c143ceadb66dde26b2b09dd4ab"
}

查看验证

http GET 127.0.0.1:8000/users/

{
    "detail": "身份认证信息未提供。"
}

http GET 127.0.0.1:8000/users/ 'Authorization:Token 9d1ff379e5e380c143ceadb66dde26b2b09dd4ab'

{
    "count": 1,
    "next": null,
    "previous": null,
    "results": [
        {
            "date_joined": "2018-09-12T22:46:35.919532+08:00",
            "id": 1,
            "url": "http://127.0.0.1:8000/users/1/",
            "username": "admin"
        }
    ]
}

参考

Python字符串转unicode emoji

Converting emojis to Unicode and vice versa in python 3

# -*- coding: UTF-8 -*-

text = u"?"
print(text.encode('unicode-escape').decode('ASCII')) # output: \U0001f188\ue513\ue220\ue21c

使用Splash解决爬取页面时需要执行JS问题

写 Scrapy 爬虫时，遇到了 js 进行跳转的页面，大家有没有好的解决方法？

答案是：

splash

Splash is a javascript rendering service with an HTTP API. It’s a lightweight browser with an HTTP API, implemented in Python 3 using Twisted and QT5.

It’s fast, lightweight and state-less which makes it easy to distribute.

Documentation is available here: https://splash.readthedocs.io/

scrapy-splash

This library provides Scrapy and JavaScript integration using Splash. The license is BSD 3-clause.

参考：

写 Scrapy 爬虫时，遇到了 js 进行跳转的页面，大家有没有好的解决方法

HTML转为PDF的两种方案(含nodejs、PHP以及Python三种实现代码)

采用chrome headless方案

为什么要采用Chrome headless

因为wkhtmltopdf内置的为qt的webkit，已经很久不更新了，很多css3以及html5都支持不友好。

Chrome官方提供的页面转换为PDF的接口

https://chromedevtools.github.io/devtools-protocol/tot/Page#method-printToPDF

命令行方式

chrome --headless --print-to-pdf=path/to/file.pdf https://example.com

参考：HTML to PDF conversion using Chrome pdfium?

NodeJS扩展

html-pdf-chrome HTML to PDF converter via Chrome/Chromium.

PHP扩展

chrome-html-to-pdf Converts HTML to PDF using Google Chrome

Chrome命令行参数列表

List of Chromium Command Line Switches

采用Qt的Webkit(PyQt5)

由于当前的chrome转换存在BUG，转换大文件时内存消耗特别大，生成的文件也比较大，测试了10多种方法后，最后决定采用PyQt5来做

#!/usr/bin/env python3

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


class PrinterView(QWebEngineView):
    """Web view that loads a URL and then either previews it or saves it as PDF."""

    def __init__(self, url, filename, do_preview, parent=None):
        """Start loading *url*.

        Args:
            url: Address of the page to load.
            filename: Path of the PDF file to write.
            do_preview: If true, show the page in a window instead of printing.
            parent: Optional parent widget.
        """
        super().__init__(parent)
        self.filename = filename
        self.do_preview = do_preview
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished)

    def load_finished(self):
        """Slot for ``loadFinished``: show the page, or export it as an A5 PDF."""
        if self.do_preview:
            self.show()
            return

        # A5 portrait with no margins.
        margins = QMarginsF(0, 0, 0, 0)
        layout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait, margins)
        web_page = self.page()
        web_page.printToPdf(self.filename, layout)
        web_page.pdfPrintingFinished.connect(on_pdf_finished)


def on_pdf_finished(result, success=True):
    """Quit the application once PDF output has finished.

    ``QWebEnginePage.pdfPrintingFinished`` emits ``(filePath, success)``.
    A one-argument slot only receives the file path, and a non-empty path is
    always truthy, so a failed export still ended with exit status 0.
    Accepting the second argument lets a failure reach the exit code.

    Args:
        result: The file path emitted by ``pdfPrintingFinished``, or the
            single value passed by a one-argument completion callback.
        success: The signal's success flag.  Defaults to ``True`` so callers
            that pass a single value keep their previous behaviour.
    """
    if result and success:
        print(result)
        QApplication.exit()
    else:
        # Non-zero exit status signals the failure to the calling process.
        QApplication.exit(1)


if __name__ == '__main__':
    # Create the Qt application first, then read this script's own options.
    app = QApplication(sys.argv)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--url", "-i",
        required=True,
        help="Input URL (http://example.com, file:///home/user/example.html, ...)",
    )
    parser.add_argument("--output", "-o", required=True, help="Write pdf to this file")
    parser.add_argument("--preview", "-p", action="store_true", help="Open preview")
    args = parser.parse_args()

    # The view stays bound to a name while the event loop runs.
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

采用qt打印

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineProfile
from PyQt5.QtWidgets import QApplication
from PyQt5.QtPrintSupport import QPrinter, QPrintDialog


class PrinterView(QWebEngineView):
    """Web view that loads a URL and prints it to an A5 PDF through QPrinter."""

    # QWebEngineProfile.setHttpCacheMaximumSize() takes a C++ int, so
    # 2**31 - 1 bytes is the largest size that can be passed.
    _HTTP_CACHE_LIMIT = 2 ** 31 - 1

    def __init__(self, url, filename, do_preview, parent=None):
        """Configure the PDF printer and start loading *url*.

        Args:
            url: Address of the page to load.
            filename: Path of the PDF file to write.
            do_preview: Stored on the instance; only ``load_finished`` reads
                it, and that slot is not the one connected below.
            parent: Optional parent widget.
        """
        # Printer set up for A5 portrait PDF output with zero margins.
        self.printer = QPrinter()
        self.printer.setPageSize(QPrinter.A5)
        self.printer.setOrientation(QPrinter.Portrait)
        self.printer.setOutputFormat(QPrinter.PdfFormat)
        self.printer.setOutputFileName(filename)
        self.printer.setPageMargins(0, 0, 0, 0, QPrinter.Millimeter)
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        profile = self.page().profile()
        # 5 GiB was requested, but that does not fit the int parameter
        # (it overflows); clamp to the largest representable size instead.
        profile.setHttpCacheMaximumSize(
            min(5 * 1024 * 1024 * 1024, self._HTTP_CACHE_LIMIT))
        profile.setHttpCacheType(QWebEngineProfile.MemoryHttpCache)
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        # load_finished2 (QPrinter path) is the active slot.
        self.loadFinished.connect(self.load_finished2)
        self.filename = filename

    def load_finished(self):
        """Alternative slot: preview the page or export it with printToPdf.

        Not connected by ``__init__``; kept so it can be swapped in.
        """
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)

    def load_finished2(self):
        """Slot for ``loadFinished``: show the view and print it via QPrinter."""
        self.show()
        # on_pdf_finished is passed as the completion callback.
        self.page().print(self.printer, on_pdf_finished)


def on_pdf_finished(result, success=True):
    """Quit the application once printing has finished.

    This function is used in two ways: as the completion callback of
    ``QWebEnginePage.print()``, which passes a single value, and as a slot
    for ``pdfPrintingFinished``, which emits ``(filePath, success)``.  With
    a one-argument slot the signal only delivered the file path, which is
    always truthy, so a failed export still ended with exit status 0.

    Args:
        result: The single value from the print callback, or the file path
            emitted by ``pdfPrintingFinished``.
        success: The signal's success flag.  Defaults to ``True`` so the
            single-argument callback keeps its previous behaviour.
    """
    if result and success:
        print(result)
        QApplication.exit()
    else:
        # Non-zero exit status signals the failure to the calling process.
        QApplication.exit(1)


if __name__ == '__main__':
    # Create the Qt application first, then read this script's own options.
    app = QApplication(sys.argv)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--url", "-i",
        required=True,
        help="Input URL (http://example.com, file:///home/user/example.html, ...)",
    )
    parser.add_argument("--output", "-o", required=True, help="Write pdf to this file")
    parser.add_argument("--preview", "-p", action="store_true", help="Open preview")
    args = parser.parse_args()

    # The view stays bound to a name while the event loop runs.
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

使用firefox的pdf

slimer-html-pdf – convert any HTML document to PDF format using slimerjs (Gecko)

大文件合并(Python)

 def on_pdf_finished(self, result):
        """Record one finished export; after the last one, merge all parts.

        Args:
            result: Truthy on success.  It is concatenated with a str below,
                so it is presumably the output file path -- TODO confirm
                against the code that invokes this callback.
        """
        if result:
            print(result + ', total ' + str(self.total))
        else:
            print("导出失败")
        # The counter advances whether or not the export succeeded.
        self.printed = self.printed + 1
        print('导出第', self.printed, '本')
        if self.printed < self.total:
            # More parts remain; print_book presumably starts the next export.
            self.print_book()
        else:
            print('开始合并')
            merger = PdfFileMerger()
            for index in range(0, self.total):
                # Parts are read from '<filename>.<index>.pdf'.
                filepath = self.filename + '.' + str(index) + '.pdf'
                merger.append(filepath)
                print('合并第', index, '本')
            merger.write(self.filename)
            merger.close()
            print('合并完成，开始清除临时文件')
            # NOTE(review): the cleanup below is commented out, so the part
            # files stay on disk even though completion is reported next.
            # for index in range(0, self.total):
            #     filepath = self.filename + '.' + str(index) + '.pdf'
            #     os.remove(filepath)
            print('清除临时文件完成')
            QApplication.exit()

django2 + uwsgi + nginx

安装uwsgi模块

pip install uwsgi

测试uwsgi服务

uwsgi --http 0.0.0.0:8080 --file project/wsgi.py --static-map=/static=static

配置uwsgi.ini

# uWSGI configuration file (started with: uwsgi --ini uwsgi.ini)
[uwsgi]
# Project directory
chdir=/data/pyproject/zc1024
# The project's WSGI application
module=zc1024.wsgi:application
# Path of the socket file
socket=/data/pyproject/zc1024/tmp/uwsgi.sock
# Number of worker processes
workers=4
pidfile=/data/pyproject/zc1024/tmp/uwsgi.pid
# IP address and port for HTTP
http=127.0.0.1:8080
# Static file mapping
static-map=/static=/data/pyproject/zc1024/static
# User and group uWSGI runs as
uid=ning
gid=ning
# Enable the master process
master=true
# Remove the unix socket and pid file automatically when the service stops
vacuum=true
# Serialize accept() where possible
thunder-lock=true
# Enable threads
enable-threads=true
# Self-termination (harakiri) timeout
harakiri=30
# Buffering setting
post-buffering=4096
# Log output path (NOTE(review): the value is a file, not a directory)
daemonize=/data/pyproject/zc1024/tmp/uwsgi.log

运行配置

uwsgi --ini uwsgi.ini

配置nginx

 # Route the project to uWSGI
location / { # this location plays the same role as a Django route such as url(r'^admin/', admin.site.urls)
include uwsgi_params; # pull in the nginx parameters used to talk to uWSGI
uwsgi_connect_timeout 30; # timeout for connecting to uWSGI
uwsgi_pass unix:/data/pyproject/zc1024/tmp/uwsgi.sock; # the uWSGI socket file; all dynamic requests are handed to it
}

# Static files are served from this directory
location /static/ {
alias /data/pyproject/zc1024/static/;
index index.html index.htm;
}

重新加载nginx配置

nginx -s reload

Django CMS比较

Mezzanine is a powerful, consistent, and flexible content management platform. Built using the Django framework, Mezzanine provides a simple yet highly extensible architecture that encourages diving in and hacking on the code. Mezzanine is BSD licensed and supported by a diverse and active community.

In some ways, Mezzanine resembles tools such as WordPress, providing an intuitive interface for managing pages, blog posts, form data, store products, and other types of content. But Mezzanine is also different. Unlike many other platforms that make extensive use of modules or reusable applications, Mezzanine provides most of its functionality by default. This approach yields a more integrated and efficient platform.

Django Fiber

Django Fiber – a simple, open-source, user-friendly CMS for all your django projects. It complements your project, it doesn’t take it over. It allows you to create simple textual, template based pages, add simple content items in pages and views, and adds simple menus that always work. All this can be maintained by a friendly frontend admin.

django CMS

Enterprise content management with django

Django-Fluent

A smooth, flexible CMS to create the designs you like, built on top of the powerful Django framework.

Django-Fluent CMS is an Open Source CMS, designed for the following needs:

Shape the CMS according to the client’s needs.
Make any kind of design editable for end-users.
Be easy to use
Be easy to code with
Be usable for small up to large sites

The CMS is flexible for your own needs. You can install parts of the CMS you like to use, and leave everything else out of your project. The system is designed to perform well (blocks are cached in memcache), models and admin screens can be modified to your needs easily.

FeinCMS

One of the most advanced Content Management Systems built on Django

Wagtail

There are plenty of great open source content management systems. We’ve used Drupal very successfully on big sites for clients including high profile campaigning NGOs, fundraising charities, think tanks, universities and public sector organisations. There are also some excellent Django CMSs, including Mezzanine, Fein and Django CMS, with thriving developer communities and impressive case studies.

But having built content-managed websites for 14 years we have strong opinions about the editor experience and how a CMS should work and be structured, and we need to manage a more rapid pace of development than we can achieve by contributing to existing projects.

python+selenium自动登录淘宝网并获取订单数据

本文将介绍使用python3+ selenium自动登录淘宝，并获取订单信息。

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.action_chains import ActionChains

import unittest, time, re


class Login(unittest.TestCase):
    """Log in to Taobao with Firefox and print an order's logistics page.

    Elements are located with ``find_element(By.<strategy>, value)``; the
    ``find_element_by_*`` helpers used previously were removed in
    Selenium 4.3, while the ``By`` form works in Selenium 3 and 4 alike.
    """

    def setUp(self):
        """Start Firefox with a 30 s implicit wait and a maximised window."""
        # binary = FirefoxBinary(r'C:\Program Files\Mozilla Firefox\firefox.exe')
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        self.base_url = "https://www.taobao.com/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_login(self):
        """Sign in with username/password, then open and print an order page."""
        driver = self.driver
        # Open the login page
        driver.get("https://login.taobao.com/member/login.jhtml?&redirectURL=http%3A%2F%2Fwww.taobao.com%2F")
        self.driver.implicitly_wait(5)
        # Switch to username/password login
        driver.find_element(By.ID, 'J_Quick2Static').click()
        self.driver.implicitly_wait(2)
        # Enter the username and password
        driver.find_element(By.ID, "TPL_username_1").clear()
        driver.find_element(By.ID, "TPL_username_1").send_keys("username")
        driver.find_element(By.ID, "TPL_password_1").clear()
        driver.find_element(By.ID, "TPL_password_1").send_keys("passwd")
        driver.implicitly_wait(5)
        # A slider captcha may appear here; it is not handled automatically.
        # Submit the form
        driver.find_element(By.ID, "J_SubmitStatic").click()
        # Wait until the logged-in page shows the seller-centre link.  A
        # missing link raises NoSuchElementException once the implicit wait
        # expires; that means "not logged in yet", so keep polling instead
        # of failing.  There is no overall deadline, as before.
        while True:
            try:
                if driver.find_element(By.LINK_TEXT, "卖家中心").is_displayed():
                    break
            except NoSuchElementException:
                pass
            # Short pause so the loop does not spin on a hidden element.
            time.sleep(0.5)
        # Open the detail page of a sold order
        driver.execute_script('window.location="https://trade.taobao.com/trade/detail/trade_order_detail.htm?biz_order_id=xxxxxx"')
        driver.implicitly_wait(5)
        driver.find_element(By.LINK_TEXT, "收货和物流信息").click()
        print(driver.page_source)


    def tearDown(self):
        """Check that no verification errors were collected."""
        # NOTE(review): quit() is left disabled as in the original, so the
        # browser window stays open after the test.
        # self.driver.quit()
        self.assertEqual([], self.verificationErrors)


# Run the test case when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()

fedora27+python3+virtualenv+virtualenvwrapper安装及使用

安装virtualenvwrapper

这里使用pip3进行安装即可

sudo pip3 install virtualenvwrapper

设置virtualenvwrapper的运行环境变量

编辑~/.bashrc，加入以下几行

# Python interpreter for virtualenvwrapper
VIRTUALENVWRAPPER_PYTHON=/usr/bin/python3 
# Directory in which the virtual environments are created
export WORKON_HOME='~/.virtualenvs'
# Load the virtualenvwrapper shell functions
source /usr/local/bin/virtualenvwrapper.sh

使环境变量马上生效：

source ~/.bashrc

virtualenvwrapper使用

mkvirtualenv 新建虚拟环境

mkvirtualenv env1

建立后可以看到终端会以env1开头

(env1) [ning@localhost]$

再建立一个env2

mkvirtualenv env2

workon 启动/切换虚拟环境

workon env1

deactivate 离开虚拟环境

deactivate

rmvirtualenv 删除虚拟环境

rmvirtualenv env2

virtualenvwrapper help 查看virtualenvwrapper使用帮助

virtualenvwrapper is a set of extensions to Ian Bicking's virtualenv
tool.  The extensions include wrappers for creating and deleting
virtual environments and otherwise managing your development workflow,
making it easier to work on more than one project at a time without
introducing conflicts in their dependencies.

For more information please refer to the documentation:

    http://virtualenvwrapper.readthedocs.org/en/latest/command_ref.html

Commands available:

  add2virtualenv: add directory to the import path

  allvirtualenv: run a command in all virtualenvs

  cdproject: change directory to the active project

  cdsitepackages: change to the site-packages directory

  cdvirtualenv: change to the $VIRTUAL_ENV directory

  cpvirtualenv: duplicate the named virtualenv to make a new one

  lssitepackages: list contents of the site-packages directory

  lsvirtualenv: list virtualenvs

  mkproject: create a new project directory and its associated virtualenv

  mktmpenv: create a temporary virtualenv

  mkvirtualenv: Create a new virtualenv in $WORKON_HOME

  rmvirtualenv: Remove a virtualenv

  setvirtualenvproject: associate a project directory with a virtualenv

  showvirtualenv: show details of a single virtualenv

  toggleglobalsitepackages: turn access to global site-packages on/off

  virtualenvwrapper: show this help message

  wipeenv: remove all packages installed in the current virtualenv

  workon: list or change working virtualenvs

virtualenv --help 帮助

Usage: virtualenv [OPTIONS] DEST_DIR

Options:
  --version             show program's version number and exit
  -h, --help            show this help message and exit
  -v, --verbose         Increase verbosity.
  -q, --quiet           Decrease verbosity.
  -p PYTHON_EXE, --python=PYTHON_EXE
                        The Python interpreter to use, e.g.,
                        --python=python2.5 will use the python2.5 interpreter
                        to create the new environment.  The default is the
                        interpreter that virtualenv was installed with
                        (/usr/bin/python3)
  --clear               Clear out the non-root install and start from scratch.
  --no-site-packages    DEPRECATED. Retained only for backward compatibility.
                        Not having access to global site-packages is now the
                        default behavior.
  --system-site-packages
                        Give the virtual environment access to the global
                        site-packages.
  --always-copy         Always copy files rather than symlinking.
  --unzip-setuptools    Unzip Setuptools when installing it.
  --relocatable         Make an EXISTING virtualenv environment relocatable.
                        This fixes up scripts and makes all .pth files
                        relative.
  --no-setuptools       Do not install setuptools in the new virtualenv.
  --no-pip              Do not install pip in the new virtualenv.
  --no-wheel            Do not install wheel in the new virtualenv.
  --extra-search-dir=DIR
                        Directory to look for setuptools/pip distributions in.
                        This option can be used multiple times.
  --download            Download preinstalled packages from PyPI.
  --no-download, --never-download
                        Do not download preinstalled packages from PyPI.
  --prompt=PROMPT       Provides an alternative prompt prefix for this
                        environment.
  --setuptools          DEPRECATED. Retained only for backward compatibility.
                        This option has no effect.
  --distribute          DEPRECATED. Retained only for backward compatibility.
                        This option has no effect.