分类目录归档:Python

著作权代码收集工具

使用Python编写,可以设置源码目录,排除目录,收集文件类型,输出目标文件,是否允许空白,编码等。

使用示例:

python zhuzuoquan.py -s D:\pythoncode\projectname \
-e debug,release \
-t .h,.cpp \
-en gb18030 \
-o result.txt

zhuzuoquan.py 代码

#!/usr/bin/python
# -*- coding: utf-8 -*-

import argparse
import os

parser = argparse.ArgumentParser(
    description='著作权代码收集工具', usage='''
    zhuzuoquan.py -s 源码目录 -e 排除目录 -t 文件类型 -o 输出文件
    ''')
parser.add_argument('-s', help='源文件目录', required=True)
parser.add_argument('-e', help='排除目录,多个用半角逗号分割', )
parser.add_argument('-t', help='扩展名,多个用半角逗号分割', required=True)
parser.add_argument('-o', help='输出文件')
parser.add_argument('-el', help='允许空白行')
parser.add_argument('-en', help='编码', default='utf-8')

def filter_files(root_dir, excluded_dirs, allow_types):
    valid_files = []
    for root, dirs, files in os.walk(root_dir):

        excluded = False
        d = root[root_dir.__len__():]
        for ex in excluded_dirs:
            if d.startswith(ex):
                excluded = True
                continue
        if excluded:
            continue

        for f in files:
            allow = False
            for t in allow_types:
                if f.endswith(t):
                    allow = True
            if allow:
                filepath = os.path.join(root, f)
                valid_files.append(filepath)
    return valid_files

if __name__ == '__main__':
    args = parser.parse_args()

    if args.e:
        excluded_dirs = args.e.split(',')
    else:
        excluded_dirs = []

    allow_types = args.t.split(',')

    root_dir = args.s

    output_file = args.o

    allow_empty_line = args.el

    encoding = args.en

    files = filter_files(root_dir, excluded_dirs, allow_types)
    total_files = total_lines = 0
    if output_file:
        with open(output_file, encoding=encoding, mode='w') as op:
            op.write("")

    for f in files:
        total_files = total_files + 1
        try:
            with open(f, encoding=encoding) as fh:
                content = fh.read()
                if not allow_empty_line:
                    content = "\n".join([s for s in content.splitlines() if s.strip()])
                lines = len(content.splitlines())
                relative_path = f[root_dir.__len__():]
                print(relative_path, lines)
                total_lines = total_lines + lines
                if output_file:
                    fh.seek(0)
                    with open(output_file, encoding=encoding, mode='a+') as op:
                        op.write(relative_path + ":\n\n" + content + "\n\n\n")
        except:
            pass

    print("Total: files %d, lines %d" % (total_files, total_lines))

Matplotlib使用中文字体(linux)

查看系统已有中文字体

fc-list :lang=zh

查询matplotlib默认配置示例

import matplotlib
matplotlib.matplotlib_fname()

查看当前系统matplotlib配置文件路径

import matplotlib
matplotlib.get_configdir()

全局修改

在系统如今下新增matplotlibrc文件,然后修改font.sans-serif,添加中文字体

font.sans-serif : WenQuanYi Micro Hei, Bitstream Vera Sans, ...

只修改当前程序

import matplotlib
matplotlib.rcParams['font.sans-serif'] = 'WenQuanYi Micro Hei'

参考

Python进入交互模式4种方法

使用python -i参数

添加-i参数

python -i myapp.py

使用code.interact()

    import code
    # before
    code.interact()
    # after

使用pdb lib

for thing in set_of_things:
    import pdb;
    pdb.set_trace()
    do_stuff_to(thing)

使用IPython embed

from IPython import embed

for thing in set_of_things:
  embed()
  do_stuff_to(thing)

Python 修改PDF文档尺寸以及去除水印图片

# -*- coding: UTF-8 -*-

import sys
import os

from pdfrw import PageMerge, PdfReader, PdfWriter, IndirectPdfDict
import fitz


# resize
def adjust(page):
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    viewrect = ((x2 - 421) / 2, (y2 - 595) / 2, 421, 595)
    page = PageMerge().add(page, viewrect=viewrect)
    return page.render()

fin, = sys.argv[1:]
fout = 'mid.' + os.path.basename(fin)
reader = PdfReader(fin)
writer = PdfWriter(fout)
for p in reader.pages:
    writer.addpage(adjust(p))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()

# trip backgroud images
doc = fitz.open(fout)
for i in range(len(doc)):
    imglist = doc.getPageImageList(i)
    for img in imglist:
        xref = img[0]
        if xref==51:
            doc._deleteObject(xref)
        print(img)
doc.save('new.' + os.path.basename(fin))

Python通用logging方法:控制台输出以及文件自动切分

import logging
from logging.handlers import TimedRotatingFileHandler

import sys

def get_logger():
    logger = logging.getLogger()
    hd = logging.StreamHandler(sys.stdout)
    # hd = TimedRotatingFileHandler('mylog.log', when='D',maxBytes=10240000 backupCount=30)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    hd.setFormatter(formatter)
    logger.addHandler(hd)
    logger.setLevel(logging.DEBUG)
    return logger

django rest framework token验证指南

安装app

修改settings.py增加rest_framework.authtoken

INSTALLED_APPS = (
    'rest_framework',
    'rest_framework.authtoken',
    'myapp',
)

增加权限验证

REST_FRAMEWORK = {
    'DEFAULT_PERMISSION_CLASSES': (
        'rest_framework.permissions.IsAdminUser',
    ),
    'DEFAULT_AUTHENTICATION_CLASSES': (
        'rest_framework.authentication.SessionAuthentication',
        'rest_framework.authentication.TokenAuthentication',
    )
}

升级表可以看到增加了authtoken_token表

python manage.py migrate

添加url

编辑urls.py

from rest_framework.authtoken import views

urlpatterns = [
    path('admin/', admin.site.urls),
    url(r'^', include(router.urls)),
    url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
    url(r'^api-token-auth/', views.obtain_auth_token),
]

获取token

http POST 127.0.0.1:8000/api-token-auth/ username='admin' password='password'

{
    "token": "9d1ff379e5e380c143ceadb66dde26b2b09dd4ab"
}

查看验证

http GET 127.0.0.1:8000/users/

{
    "detail": "身份认证信息未提供。"
}

http GET 127.0.0.1:8000/users/ 'Authorization:Token 9d1ff379e5e380c143ceadb66dde26b2b09dd4ab'

{
    "count": 1,
    "next": null,
    "previous": null,
    "results": [
        {
            "date_joined": "2018-09-12T22:46:35.919532+08:00",
            "id": 1,
            "url": "http://127.0.0.1:8000/users/1/",
            "username": "admin"
        }
    ]
}

参考

使用Splash解决爬取页面时需要执行JS问题

写 Scrapy 爬虫时,遇到了 js 进行跳转的页面,大家有没有好的解决方法?

答案是:

splash

Splash is a javascript rendering service with an HTTP API. It's a lightweight browser with an HTTP API, implemented in Python 3 using Twisted and QT5.

It's fast, lightweight and state-less which makes it easy to distribute.

Documentation Documentation is available here: https://splash.readthedocs.io/

scrapy-splash

This library provides Scrapy and JavaScript integration using Splash. The license is BSD 3-clause.

参考:

HTML转为PDF的两种方案(含nodejs、PHP以及Python三种实现代码)

采用chrome headless方案

为什么要采用Chrome headless

因为wkhtmltopdf内置的为qt的webkit,已经很久不更新了,很多css3以及html5都支持不友好。

Chrome官方提供的页面转换为PDF的接口

https://chromedevtools.github.io/devtools-protocol/tot/Page#method-printToPDF

命令行方式

chrome --headless --print-to-pdf=path/to/file.pdf https://example.com

参考:HTML to PDF conversion using Chrome pdfium?

NodeJS扩展

html-pdf-chrome HTML to PDF converter via Chrome/Chromium.

PHP扩展

chrome-html-to-pdf Converts HTML to PDF using Google Chrome

Chrome命令行参数列表

List of Chromium Command Line Switches

采用Qt的Webkit(PyQt5)

由于当前的chrome转换存在BUG,转换大文件时内存消耗特别大,生成的文件也比较大,测试了10多种方法后,最后决定采用PyQt5来做

#!/usr/bin/env python3

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


class PrinterView(QWebEngineView):
    def __init__(self, url, filename, do_preview, parent=None):
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished)
        self.filename = filename

    def load_finished(self):
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)


def on_pdf_finished(result):
    if result:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-i", help="Input URL (http://example.com, file:///home/user/example.html, ...)",
                        required=True)
    parser.add_argument("--output", "-o", help="Write pdf to this file", required=True)
    parser.add_argument("--preview", "-p", help="Open preview", action="store_true")
    args = parser.parse_args()
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

 采用qt打印

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineProfile
from PyQt5.QtWidgets import QApplication
from PyQt5.QtPrintSupport import QPrinter, QPrintDialog


class PrinterView(QWebEngineView):
    def __init__(self, url, filename, do_preview, parent=None):
        self.printer = QPrinter()
        self.printer.setPageSize(QPrinter.A5)
        self.printer.setOrientation(QPrinter.Portrait)
        self.printer.setOutputFormat(QPrinter.PdfFormat)
        self.printer.setOutputFileName(filename)
        self.printer.setPageMargins(0, 0, 0, 0, QPrinter.Millimeter)
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.page().profile().setHttpCacheMaximumSize(5 * 1024 * 1024 * 1024)
        self.page().profile().setHttpCacheType(QWebEngineProfile.MemoryHttpCache)
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished2)
        self.filename = filename

    def load_finished(self):
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)

    def load_finished2(self):
        self.show()
        self.page().print(self.printer, on_pdf_finished)


def on_pdf_finished(result):
    if result:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-i", help="Input URL (http://example.com, file:///home/user/example.html, ...)",
                        required=True)
    parser.add_argument("--output", "-o", help="Write pdf to this file", required=True)
    parser.add_argument("--preview", "-p", help="Open preview", action="store_true")
    args = parser.parse_args()
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

使用firefox的pdf

slimer-html-pdf - convert any HTML document to PDF format using slimerjs (Gecko)

大文件合并(Python)

 def on_pdf_finished(self, result):
        if result:
            print(result + ', total ' + str(self.total))
        else:
            print("导出失败")
        self.printed = self.printed + 1
        print('导出第', self.printed, '本')
        if self.printed < self.total:
            self.print_book()
        else:
            print('开始合并')
            merger = PdfFileMerger()
            for index in range(0, self.total):
                filepath = self.filename + '.' + str(index) + '.pdf'
                merger.append(filepath)
                print('合并第', index, '本')
            merger.write(self.filename)
            merger.close()
            print('合并完成,开始清除临时文件')
            # for index in range(0, self.total):
            #     filepath = self.filename + '.' + str(index) + '.pdf'
            #     os.remove(filepath)
            print('清除临时文件完成')
            QApplication.exit()

django2 + uwsgi + nginx

安装uwsgi模块

pip install uwsgi

测试uwsgi服务

uwsgi --http 0.0.0.0:8080 --file project/wsgi.py --static-map=/static=static

配置uwsgi.ini

# uwsig使用配置文件启动
[uwsgi]
# 项目目录
chdir=/data/pyproject/zc1024
# 指定项目的application
module=zc1024.wsgi:application
# 指定sock的文件路径
socket=/data/pyproject/zc1024/tmp/uwsgi.sock
# 进程个数
workers=4
pidfile=/data/pyproject/zc1024/tmp/uwsgi.pid
# 指定IP端口
http=127.0.0.1:8080
# 指定静态文件
static-map=/static=/data/pyproject/zc1024/static
# 启动uwsgi的用户名和用户组
uid=ning
gid=ning
# 启用主进程
master=true
# 自动移除unix Socket和pid文件当服务停止的时候
vacuum=true
# 序列化接受的内容,如果可能的话
thunder-lock=true
# 启用线程
enable-threads=true
# 设置自中断时间
harakiri=30
# 设置缓冲
post-buffering=4096
# 设置日志目录
daemonize=/data/pyproject/zc1024/tmp/uwsgi.log

运行配置

uwsgi --ini uwsgi.ini

配置nginx

 # 指定项目路径uwsgi
location / { # 这个location就和咱们Django的url(r'^admin/', admin.site.urls),
include uwsgi_params; # 导入一个Nginx模块他是用来和uWSGI进行通讯的
uwsgi_connect_timeout 30; # 设置连接uWSGI超时时间
uwsgi_pass unix:/data/pyproject/zc1024/tmp/uwsgi.sock; # 指定uwsgi的sock文件所有动态请求就会直接丢给他
}

# 指定静态文件路径
location /static/ {
alias /data/pyproject/zc1024/static/;
index index.html index.htm;
}

重新加载nginx配置

nginx -s reload