分类目录归档:Python

Python进入交互模式4种方法

使用python -i参数

添加-i参数

python -i myapp.py

使用code.interact()

    import code
    # before
    code.interact()
    # after

使用pdb lib

for thing in set_of_things:
    import pdb;
    pdb.set_trace()
    do_stuff_to(thing)

使用IPython embed

from IPython import embed

for thing in set_of_things:
  embed()
  do_stuff_to(thing)

Python 修改PDF文档尺寸以及去除水印图片

# -*- coding: UTF-8 -*-

import sys
import os

from pdfrw import PageMerge, PdfReader, PdfWriter, IndirectPdfDict
import fitz


# resize
def adjust(page):
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    viewrect = ((x2 - 421) / 2, (y2 - 595) / 2, 421, 595)
    page = PageMerge().add(page, viewrect=viewrect)
    return page.render()

fin, = sys.argv[1:]
fout = 'mid.' + os.path.basename(fin)
reader = PdfReader(fin)
writer = PdfWriter(fout)
for p in reader.pages:
    writer.addpage(adjust(p))
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write()

# trip backgroud images
doc = fitz.open(fout)
for i in range(len(doc)):
    imglist = doc.getPageImageList(i)
    for img in imglist:
        xref = img[0]
        if xref==51:
            doc._deleteObject(xref)
        print(img)
doc.save('new.' + os.path.basename(fin))

Python通用logging方法:控制台输出以及文件自动切分

import logging
from logging.handlers import TimedRotatingFileHandler

import sys

def get_logger():
    logger = logging.getLogger()
    hd = logging.StreamHandler(sys.stdout)
    # hd = TimedRotatingFileHandler('mylog.log', when='D',maxBytes=10240000 backupCount=30)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    hd.setFormatter(formatter)
    logger.addHandler(hd)
    logger.setLevel(logging.DEBUG)
    return logger

django rest framework token验证指南

安装app

修改settings.py增加rest_framework.authtoken

INSTALLED_APPS = (
    'rest_framework',
    'rest_framework.authtoken',
    'myapp',
)

增加权限验证

REST_FRAMEWORK = {
    'DEFAULT_PERMISSION_CLASSES': (
        'rest_framework.permissions.IsAdminUser',
    ),
    'DEFAULT_AUTHENTICATION_CLASSES': (
        'rest_framework.authentication.SessionAuthentication',
        'rest_framework.authentication.TokenAuthentication',
    )
}

升级表可以看到增加了authtoken_token表

python manage.py migrate

添加url

编辑urls.py

from rest_framework.authtoken import views

urlpatterns = [
    path('admin/', admin.site.urls),
    url(r'^', include(router.urls)),
    url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
    url(r'^api-token-auth/', views.obtain_auth_token),
]

获取token

http POST 127.0.0.1:8000/api-token-auth/ username='admin' password='password'

{
    "token": "9d1ff379e5e380c143ceadb66dde26b2b09dd4ab"
}

查看验证

http GET 127.0.0.1:8000/users/

{
    "detail": "身份认证信息未提供。"
}

http GET 127.0.0.1:8000/users/ 'Authorization:Token 9d1ff379e5e380c143ceadb66dde26b2b09dd4ab'

{
    "count": 1,
    "next": null,
    "previous": null,
    "results": [
        {
            "date_joined": "2018-09-12T22:46:35.919532+08:00",
            "id": 1,
            "url": "http://127.0.0.1:8000/users/1/",
            "username": "admin"
        }
    ]
}

参考

使用Splash解决爬取页面时需要执行JS问题

写 Scrapy 爬虫时,遇到了 js 进行跳转的页面,大家有没有好的解决方法?

答案是:

splash

Splash is a javascript rendering service with an HTTP API. It's a lightweight browser with an HTTP API, implemented in Python 3 using Twisted and QT5.

It's fast, lightweight and state-less which makes it easy to distribute.

Documentation Documentation is available here: https://splash.readthedocs.io/

scrapy-splash

This library provides Scrapy and JavaScript integration using Splash. The license is BSD 3-clause.

参考:

HTML转为PDF的两种方案(含nodejs、PHP以及Python三种实现代码)

采用chrome headless方案

为什么要采用Chrome headless

因为wkhtmltopdf内置的为qt的webkit,已经很久不更新了,很多css3以及html5都支持不友好。

Chrome官方提供的页面转换为PDF的接口

https://chromedevtools.github.io/devtools-protocol/tot/Page#method-printToPDF

命令行方式

chrome --headless --print-to-pdf=path/to/file.pdf https://example.com

参考:HTML to PDF conversion using Chrome pdfium?

NodeJS扩展

html-pdf-chrome HTML to PDF converter via Chrome/Chromium.

PHP扩展

chrome-html-to-pdf Converts HTML to PDF using Google Chrome

Chrome命令行参数列表

List of Chromium Command Line Switches

采用Qt的Webkit(PyQt5)

由于当前的chrome转换存在BUG,转换大文件时内存消耗特别大,生成的文件也比较大,测试了10多种方法后,最后决定采用PyQt5来做

#!/usr/bin/env python3

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


class PrinterView(QWebEngineView):
    def __init__(self, url, filename, do_preview, parent=None):
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished)
        self.filename = filename

    def load_finished(self):
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)


def on_pdf_finished(result):
    if result:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-i", help="Input URL (http://example.com, file:///home/user/example.html, ...)",
                        required=True)
    parser.add_argument("--output", "-o", help="Write pdf to this file", required=True)
    parser.add_argument("--preview", "-p", help="Open preview", action="store_true")
    args = parser.parse_args()
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

 采用qt打印

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineProfile
from PyQt5.QtWidgets import QApplication
from PyQt5.QtPrintSupport import QPrinter, QPrintDialog


class PrinterView(QWebEngineView):
    def __init__(self, url, filename, do_preview, parent=None):
        self.printer = QPrinter()
        self.printer.setPageSize(QPrinter.A5)
        self.printer.setOrientation(QPrinter.Portrait)
        self.printer.setOutputFormat(QPrinter.PdfFormat)
        self.printer.setOutputFileName(filename)
        self.printer.setPageMargins(0, 0, 0, 0, QPrinter.Millimeter)
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.page().profile().setHttpCacheMaximumSize(5 * 1024 * 1024 * 1024)
        self.page().profile().setHttpCacheType(QWebEngineProfile.MemoryHttpCache)
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished2)
        self.filename = filename

    def load_finished(self):
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)

    def load_finished2(self):
        self.show()
        self.page().print(self.printer, on_pdf_finished)


def on_pdf_finished(result):
    if result:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-i", help="Input URL (http://example.com, file:///home/user/example.html, ...)",
                        required=True)
    parser.add_argument("--output", "-o", help="Write pdf to this file", required=True)
    parser.add_argument("--preview", "-p", help="Open preview", action="store_true")
    args = parser.parse_args()
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

使用firefox的pdf

slimer-html-pdf - convert any HTML document to PDF format using slimerjs (Gecko)

大文件合并(Python)

 def on_pdf_finished(self, result):
        if result:
            print(result + ', total ' + str(self.total))
        else:
            print("导出失败")
        self.printed = self.printed + 1
        print('导出第', self.printed, '本')
        if self.printed < self.total:
            self.print_book()
        else:
            print('开始合并')
            merger = PdfFileMerger()
            for index in range(0, self.total):
                filepath = self.filename + '.' + str(index) + '.pdf'
                merger.append(filepath)
                print('合并第', index, '本')
            merger.write(self.filename)
            merger.close()
            print('合并完成,开始清除临时文件')
            # for index in range(0, self.total):
            #     filepath = self.filename + '.' + str(index) + '.pdf'
            #     os.remove(filepath)
            print('清除临时文件完成')
            QApplication.exit()

django2 + uwsgi + nginx

安装uwsgi模块

pip install uwsgi

测试uwsgi服务

uwsgi --http 0.0.0.0:8080 --file project/wsgi.py --static-map=/static=static

配置uwsgi.ini

# uwsig使用配置文件启动
[uwsgi]
# 项目目录
chdir=/data/pyproject/zc1024
# 指定项目的application
module=zc1024.wsgi:application
# 指定sock的文件路径
socket=/data/pyproject/zc1024/tmp/uwsgi.sock
# 进程个数
workers=4
pidfile=/data/pyproject/zc1024/tmp/uwsgi.pid
# 指定IP端口
http=127.0.0.1:8080
# 指定静态文件
static-map=/static=/data/pyproject/zc1024/static
# 启动uwsgi的用户名和用户组
uid=ning
gid=ning
# 启用主进程
master=true
# 自动移除unix Socket和pid文件当服务停止的时候
vacuum=true
# 序列化接受的内容,如果可能的话
thunder-lock=true
# 启用线程
enable-threads=true
# 设置自中断时间
harakiri=30
# 设置缓冲
post-buffering=4096
# 设置日志目录
daemonize=/data/pyproject/zc1024/tmp/uwsgi.log

运行配置

uwsgi --ini uwsgi.ini

配置nginx

 # 指定项目路径uwsgi
location / { # 这个location就和咱们Django的url(r'^admin/', admin.site.urls),
include uwsgi_params; # 导入一个Nginx模块他是用来和uWSGI进行通讯的
uwsgi_connect_timeout 30; # 设置连接uWSGI超时时间
uwsgi_pass unix:/data/pyproject/zc1024/tmp/uwsgi.sock; # 指定uwsgi的sock文件所有动态请求就会直接丢给他
}

# 指定静态文件路径
location /static/ {
alias /data/pyproject/zc1024/static/;
index index.html index.htm;
}

重新加载nginx配置

nginx -s reload

Django CMS比较

Mezzanine

Mezzanine is a powerful, consistent, and flexible content management platform. Built using the Django framework, Mezzanine provides a simple yet highly extensible architecture that encourages diving in and hacking on the code. Mezzanine is BSD licensed and supported by a diverse and active community.

In some ways, Mezzanine resembles tools such as WordPress, providing an intuitive interface for managing pages, blog posts, form data, store products, and other types of content. But Mezzanine is also different. Unlike many other platforms that make extensive use of modules or reusable applications, Mezzanine provides most of its functionality by default. This approach yields a more integrated and efficient platform.

Django Fiber

Django Fiber - a simple, open-source, user-friendly CMS for all your django projects. It complements your project, it doesn't take it over. It allows you to create simple textual, template based pages, add simple content items in pages and views, and adds simple menus that always work. All this can be maintained by a friendly frontend admin.

django CMS

Enterprise content management with django

Django-Fluent

A smooth, flexible CMS to create the designs you like, built on top of the powerful Django framework.

Django-Fluent CMS is a Open Source CMS, designed for the following needs:

  • Shape the CMS according to the client's needs.
  • Make any kind of design editable for end-users.
  • Be easy to use
  • Be easy to code with -Be usable for small up to large sites

The CMS is flexible for your own needs. You can install parts of the CMS you like to use, and leave everything else out of your project. The system is designed to perform well (blocks are cached in memcache), models and admin screens can be modified to your needs easily.

FeinCMS

One of the most advanced Content Management Systems built on Django

Wagtail

There are plenty of great open source content management systems. We've used Drupal very successfully on big sites for clients including high profile campaigning NGOs, fundraising charities, think tanks, universities and public sector organisations. There are also some excellent Django CMSs, including Mezzanine, Fein and Django CMS, with thriving developer communities and impressive case studies.

But having built content-managed websites for 14 years we have strong opinions about the editor experience and how a CMS should work and be structured, and we need to manage a more rapid pace of development than we can achieve by contributing to existing projects.

python+selenium自动登录淘宝网并获取订单数据

本文将介绍使用python3+ selenium自动登录淘宝,并获取订单信息。

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.action_chains import ActionChains

import unittest, time, re


class Login(unittest.TestCase):
    def setUp(self):
        # binary = FirefoxBinary(r'C:\Program Files\Mozilla Firefox\firefox.exe')
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        self.base_url = "https://www.taobao.com/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_login(self):
        driver = self.driver
        # 打开登录页面
        driver.get("https://login.taobao.com/member/login.jhtml?&redirectURL=http%3A%2F%2Fwww.taobao.com%2F")
        self.driver.implicitly_wait(5)
        # 点击账号密码登录
        driver.find_element_by_id('J_Quick2Static').click()
        self.driver.implicitly_wait(2)
        # 输入账号密码
        driver.find_element_by_id("TPL_username_1").clear()
        driver.find_element_by_id("TPL_username_1").send_keys("username")
        driver.find_element_by_id("TPL_password_1").clear()
        driver.find_element_by_id("TPL_password_1").send_keys("passwd")
        driver.implicitly_wait(5)
        # 判断是否显示滑块,如果显示则拖动滑块
        # 提交表单
        driver.find_element_by_id("J_SubmitStatic").click()
        while True:
            if driver.find_element_by_link_text("卖家中心").is_displayed():
                break
        # 查看已卖出的宝贝
        driver.execute_script('window.location="https://trade.taobao.com/trade/detail/trade_order_detail.htm?biz_order_id=xxxxxx"')
        driver.implicitly_wait(5)
        driver.find_element_by_link_text("收货和物流信息").click()
        print(driver.page_source)


    def tearDown(self):
        # self.driver.quit()
        self.assertEqual([], self.verificationErrors)


if __name__ == "__main__":
    unittest.main()