分类目录归档:Tools

使用pdfbox给pdf去背景图片

前面采用了Python写的pdfrw做的,发现用acrobat不能编辑。

用pdfbox工具查看发现missing xobject。

java -jar pdfbox-app-2.0.13.jar PDFDebugger out.pdf

所以改用java的pdfbox库来写

package com.c4ys;

import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;

public class Main {

    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            usage();
        } else {
            PDDocument doc = PDDocument.load(new File(args[0]));
            if (doc.isEncrypted()) {
                System.err.println(
                        "Error: Encrypted documents are not supported .");
                System.exit(1);
            }

            for (PDPage page : doc.getPages()) {
                List<Object> newTokens = createTokensWithoutImage(page, args[2]);
                PDStream newContents = new PDStream(doc);
                writeTokensToStream(newContents, newTokens);
                page.setContents(newContents);
                processResources(page.getResources(), args[2]);
            }

            doc.save(args[1]);
            doc.close();
        }
    }

    private static List<Object> createTokensWithoutImage(PDContentStream contentStream, String im) throws IOException {
        PDFStreamParser parser = new PDFStreamParser(contentStream);
        Object token = parser.parseNextToken();
        List<Object> newTokens = new ArrayList<Object>();
        while (token != null) {
            if (token instanceof Operator) {
                Operator op = (Operator) token;
                if (op.getName().equalsIgnoreCase("do")) {
                    COSName previous = (COSName) newTokens.get(newTokens.size() - 1);
                    System.out.println(previous.getName());
                    if (previous.getName().equalsIgnoreCase(im)) {
                        // remove the argument to this operator
                        newTokens.remove(newTokens.size() - 1);
                        token = parser.parseNextToken();
                        continue;
                    }
                }
            }
            newTokens.add(token);
            token = parser.parseNextToken();
        }
        return newTokens;
    }


    private static void processResources(PDResources resources, String im) throws IOException {
        for (COSName name : resources.getXObjectNames()) {
            PDXObject xobject = resources.getXObject(name);
            if (xobject instanceof PDFormXObject) {
                PDFormXObject formXObject = (PDFormXObject) xobject;
                writeTokensToStream(formXObject.getContentStream(),
                        createTokensWithoutImage(formXObject, im));
                processResources(formXObject.getResources(), im);
            }
        }
        for (COSName name : resources.getPatternNames()) {
            PDAbstractPattern pattern = resources.getPattern(name);
            if (pattern instanceof PDTilingPattern) {
                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
                writeTokensToStream(tilingPattern.getContentStream(),
                        createTokensWithoutImage(tilingPattern, im));
                processResources(tilingPattern.getResources(), im);
            }
        }
    }

    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
        OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE);
        ContentStreamWriter writer = new ContentStreamWriter(out);
        writer.writeTokens(newTokens);
        out.close();
    }


    /**
     * This will print the usage for this document.
     */
    private static void usage() {
        System.err.println("Usage: java " + Main.class.getName() + " <input-pdf> <output-pdf> <image-object-name>");
    }
}

HTML转为PDF的两种方案(含nodejs、PHP以及Python三种实现代码)

采用chrome headless方案

为什么要采用Chrome headless

因为wkhtmltopdf内置的为qt的webkit,已经很久不更新了,很多css3以及html5都支持不友好。

Chrome官方提供的页面转换为PDF的接口

https://chromedevtools.github.io/devtools-protocol/tot/Page#method-printToPDF

命令行方式

chrome --headless --print-to-pdf=path/to/file.pdf https://example.com

参考:HTML to PDF conversion using Chrome pdfium?

NodeJS扩展

html-pdf-chrome HTML to PDF converter via Chrome/Chromium.

PHP扩展

chrome-html-to-pdf Converts HTML to PDF using Google Chrome

Chrome命令行参数列表

List of Chromium Command Line Switches

采用Qt的Webkit(PyQt5)

由于当前的chrome转换存在BUG,转换大文件时内存消耗特别大,生成的文件也比较大,测试了10多种方法后,最后决定采用PyQt5来做

#!/usr/bin/env python3

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


class PrinterView(QWebEngineView):
    def __init__(self, url, filename, do_preview, parent=None):
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished)
        self.filename = filename

    def load_finished(self):
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)


def on_pdf_finished(result):
    if result:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-i", help="Input URL (http://example.com, file:///home/user/example.html, ...)",
                        required=True)
    parser.add_argument("--output", "-o", help="Write pdf to this file", required=True)
    parser.add_argument("--preview", "-p", help="Open preview", action="store_true")
    args = parser.parse_args()
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

 采用qt打印

import sys
import argparse

from PyQt5.QtCore import QUrl, QMarginsF
from PyQt5.QtGui import QPageLayout, QPageSize
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage, QWebEngineProfile
from PyQt5.QtWidgets import QApplication
from PyQt5.QtPrintSupport import QPrinter, QPrintDialog


class PrinterView(QWebEngineView):
    def __init__(self, url, filename, do_preview, parent=None):
        self.printer = QPrinter()
        self.printer.setPageSize(QPrinter.A5)
        self.printer.setOrientation(QPrinter.Portrait)
        self.printer.setOutputFormat(QPrinter.PdfFormat)
        self.printer.setOutputFileName(filename)
        self.printer.setPageMargins(0, 0, 0, 0, QPrinter.Millimeter)
        super(PrinterView, self).__init__(parent)
        self.do_preview = do_preview
        self.page().profile().setHttpCacheMaximumSize(5 * 1024 * 1024 * 1024)
        self.page().profile().setHttpCacheType(QWebEngineProfile.MemoryHttpCache)
        self.setUrl(QUrl(url))
        self.setZoomFactor(1)
        self.loadFinished.connect(self.load_finished2)
        self.filename = filename

    def load_finished(self):
        if self.do_preview:
            self.show()
        else:
            pageLayout = QPageLayout(QPageSize(QPageSize.A5), QPageLayout.Portrait,
                                     QMarginsF(0, 0, 0, 0))
            self.page().printToPdf(self.filename, pageLayout)
            self.page().pdfPrintingFinished.connect(on_pdf_finished)

    def load_finished2(self):
        self.show()
        self.page().print(self.printer, on_pdf_finished)


def on_pdf_finished(result):
    if result:
        print(result)
        QApplication.exit()
    else:
        QApplication.exit(1)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", "-i", help="Input URL (http://example.com, file:///home/user/example.html, ...)",
                        required=True)
    parser.add_argument("--output", "-o", help="Write pdf to this file", required=True)
    parser.add_argument("--preview", "-p", help="Open preview", action="store_true")
    args = parser.parse_args()
    a = PrinterView(args.url, args.output, args.preview)
    sys.exit(app.exec_())

使用firefox的pdf

slimer-html-pdf – convert any HTML document to PDF format using slimerjs (Gecko)

大文件合并(Python)

 def on_pdf_finished(self, result):
        if result:
            print(result + ', total ' + str(self.total))
        else:
            print("导出失败")
        self.printed = self.printed + 1
        print('导出第', self.printed, '本')
        if self.printed < self.total:
            self.print_book()
        else:
            print('开始合并')
            merger = PdfFileMerger()
            for index in range(0, self.total):
                filepath = self.filename + '.' + str(index) + '.pdf'
                merger.append(filepath)
                print('合并第', index, '本')
            merger.write(self.filename)
            merger.close()
            print('合并完成,开始清除临时文件')
            # for index in range(0, self.total):
            #     filepath = self.filename + '.' + str(index) + '.pdf'
            #     os.remove(filepath)
            print('清除临时文件完成')
            QApplication.exit()

HTML转markdown工具比较

Html2MarkDown – aTool在线工具

一款html和markdown标签互转的工具,直接输入Html,网页会自动帮你转换。

HTML2Markdown

Javascript Implementation for converting HTML to Markdown text.

html2markdown

Javascript implementation for converting HTML to Markdown text. Browser and Node.js support.

Turndown

Convert HTML into Markdown with JavaScript.

Python 新轮子 Tomd: HTML 转 Markdown 工具库

用途: 爬虫爬文章保存到本地为 Markdown 格式

LCTT选题工具

将内容复制到左侧输入框内,点击生成MD,在中部编辑器处进行二次修改,并在右侧的预览框中查看效果。 确认无误后点击上方的复制代码按钮即可将代码复制到剪贴板中!

html-to-markdown

An HTML-to-markdown conversion helper for PHP

Markdown Navigator 2.0

Markdown language support for IntelliJ platform

django2 + uwsgi + nginx

安装uwsgi模块

pip install uwsgi

测试uwsgi服务

uwsgi --http 0.0.0.0:8080 --file project/wsgi.py --static-map=/static=static

配置uwsgi.ini

# uwsig使用配置文件启动
[uwsgi]
# 项目目录
chdir=/data/pyproject/zc1024
# 指定项目的application
module=zc1024.wsgi:application
# 指定sock的文件路径
socket=/data/pyproject/zc1024/tmp/uwsgi.sock
# 进程个数
workers=4
pidfile=/data/pyproject/zc1024/tmp/uwsgi.pid
# 指定IP端口
http=127.0.0.1:8080
# 指定静态文件
static-map=/static=/data/pyproject/zc1024/static
# 启动uwsgi的用户名和用户组
uid=ning
gid=ning
# 启用主进程
master=true
# 自动移除unix Socket和pid文件当服务停止的时候
vacuum=true
# 序列化接受的内容,如果可能的话
thunder-lock=true
# 启用线程
enable-threads=true
# 设置自中断时间
harakiri=30
# 设置缓冲
post-buffering=4096
# 设置日志目录
daemonize=/data/pyproject/zc1024/tmp/uwsgi.log

运行配置

uwsgi --ini uwsgi.ini

配置nginx

 # 指定项目路径uwsgi
location / { # 这个location就和咱们Django的url(r'^admin/', admin.site.urls),
include uwsgi_params; # 导入一个Nginx模块他是用来和uWSGI进行通讯的
uwsgi_connect_timeout 30; # 设置连接uWSGI超时时间
uwsgi_pass unix:/data/pyproject/zc1024/tmp/uwsgi.sock; # 指定uwsgi的sock文件所有动态请求就会直接丢给他
}

# 指定静态文件路径
location /static/ {
alias /data/pyproject/zc1024/static/;
index index.html index.htm;
}

重新加载nginx配置

nginx -s reload

https中Mixed Content解决办法

问题描述

HTTPS页面里动态的引入HTTP资源,比如引入一个js文件,会被直接block掉的.在HTTPS页面里通过AJAX的方式请求HTTP资源,也会被直接block掉的。

解决办法

可以在相应的页面的里加上这句代码,意思是自动将http的不安全请求升级为https

<meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">

nginx出现blocked for more than 120 seconds 以及 hung_task_timeout_secs错误解决办法

问题原因

默认情况下, Linux 会最多使用 40% 的可用内存作为文件系统缓存。当超过这个阈值后,文件系统会把将缓存中的内存全部写入磁盘, 导致后续的 IO 请求都是同步的。

将缓存写入磁盘时,有一个默认120 秒的超时时间。 出现上面的问题的原因是 IO 子系统的处理速度不够快,不能在 120 秒将缓存中的数据全部写入磁盘。

IO 系统响应缓慢,导致越来越多的请求堆积,最终系统内存全部被占用,导致系统失去响应。

解决办法

根据应用程序情况,对 vm.dirty_ratio,vm.dirty_background_ratio 两个参数进行调优设置。 例如,推荐如下设置:

# sysctl -w vm.dirty_ratio=10
# sysctl -w vm.dirty_background_ratio=5
# sysctl -p

如果系统永久生效,修改 /etc/sysctl.conf 文件。加入如下两行:

#vi /etc/sysctl.conf 
vm.dirty_background_ratio = 5
vm.dirty_ratio = 10

重启系统生效。

Vagrant box国内镜像及本地安装教程

Vagrant是非常好的本地开发环境搭建工具。通常使用官方下载都会比较慢,而国内box下载地址较少,所以我特别下载了几个传到百度网盘。

Vagrant box的百度网盘下载地址

Vagrant box Ubuntu 16.04 百度网盘下载地址

https://pan.baidu.com/s/1kVlAz59

Vagrant box Centos 7 百度网盘下载地址

http://pan.baidu.com/s/1gfNCud1

Vagrant box Debian 8 百度网盘下载地址

http://pan.baidu.com/s/1mhAuONu

下载后的使用方法

添加vagrant box到box list

vagrant box add centos7 CentOS-7.box

初始化一个虚拟机使用刚才添加的vagrant box

vagrant init centos7

启动vagrant box虚拟机

vagrant up

使用Vagrant+CentOS 7搭建PHP7开发环境(含centos7.box直接下载地址)

Vagrant是一款基于命令行的虚拟机管理软件,可以用来快速部署统一的开发环境。

下载Vagrant

https://www.vagrantup.com/downloads.html

下载CentOS 7 Box

官方box下载地址

https://app.vagrantup.com/boxes/search

第三方box下载地址

http://www.vagrantbox.es/

使用原生下载

https://app.vagrantup.com/centos/boxes/7

vagrant init centos/7
vagrant up

vagrant 配置

  config.vm.network "public_network", ip: "192.168.31.245"
  config.vm.synced_folder "d:/data", "/data"
  config.vm.synced_folder "d:/phpcode", "/phpcode"

通过下载工具下载centos 7 box

官方box文件下载地址:http://cloud.centos.org/centos/7/vagrant/x86_64/images/

百度网盘box文件下载地址

  • CentOS 7: https://pan.baidu.com/s/1kVlAz59

添加并运行box

vagrant box add centos7 CentOS-7.box
vagrant init centos7
vagrant up

基础系统安装

基本系统安装

vagrant ssh
sudo passwd vagrant
sudo yum groupinstall "Development tools" -y
sudo yum install vim gcc kernel-devel kenel-devel-`uname -r`

禁止selinux

sudo setenforce 0 

sudo vi /etc/selinux/config

SELINUX=disabled

停止防火墙

sudo systemctl disable firewalld
sudo systemctl stop firewalld

更新系统

sudo yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
sudo yum install http://rpms.remirepo.net/enterprise/remi-release-7.rpm
sudo yum install yum-utils
sudo yum-config-manager --enable remi-php72
sudo yum update
sudo yum install php-gd php-pdo php-opcache php-fpm php-pecl-redis php-pecl-mysql php-pecl-mysql php-mbstring php-intl php-cli php-xml
sudo yum install nginx -y
sudo yum install mariadb mariadb-server -y

修改nginx配置

mkdir /data/log/nginx /data/run/nginx -p
sudo service nginx stop 
sudo vim /etc/nginx/nginx.conf

nginx.conf配置修改如下:

user vagrant;
error_log /data/log/nginx/error.log;
pid /data/run/nginx/nginx.pid;

access_log  /data/log/nginx/access.log  main;

include /data/phpcode/projectname/vagrant/nginx/app.conf;

测试nginx配置

sudo nginx -t

修改nginx service配置:

sudo vim /usr/lib/systemd/system/nginx.service

nginx.service修改内容如下:

[Service]
PIDFile=/data/run/nginx/nginx.pid

重新加载service

sudo systemctl daemon-reload
sudo systemctl start nginx

修改PHP配置

mkdir  /data/run/php-fpm/session /data/run/php-fpm/wsdlcache /data/run/php-fpm/opcache /data/log/php-fpm/ -p
sudo service php-fpm stop
sudo vim /etc/php-fpm.d/www.conf

配置内容

user = vagrant
group = vagrant
php_value[session.save_path]    = /data/run/php-fpm/session
php_value[soap.wsdl_cache_dir]  = /data/run/php-fpm/wsdlcache
php_value[opcache.file_cache]  = /data/run/php-fpm/opcache
php_admin_value[error_log] = /data/log/php-fpm/www-error.log
slowlog = /data/log/php-fpm/www-slow.log
request_slowlog_timeout = 1

重启

sudo service php-fpm stop

配置Mysql

mkdir /data/mysql /data/run/mariadb /data/log/mariadb -p
sudo service mariadb stop
sudo vim /etc/my.cnf

mysqld配置

[mysqld]
datadir=/data/mysql
socket=/usr/lib/mysql/mysql.sock

[mysqld_safe]
log-error=/data/log/mariadb/mariadb.log
pid-file=/data/run/mariadb/mariadb.pid

mysql client 配置

[client]

初始化数据库

sudo /usr/libexec/mariadb-prepare-db-dir mariadb.service

修改systemd配置

sudo vim /usr/lib/systemd/system/mariadb.service

配置内容

User=vagrant
Group=vagrant

重载systemd

sudo systemctl daemon-reload
sudo systemctl start mariadb

修改mysql账号密码

'/usr/bin/mysqladmin' -u root password 'new-password'
'/usr/bin/mysqladmin' -u root -h localhost.localdomain password 'new-password'

# Alternatively you can run:
'/usr/bin/mysql_secure_installation'

配置composer

下载安装文件

php -r "copy('https://install.phpcomposer.com/installer', 'composer-setup.php');"

安装

php composer-setup.php

删除安装文件

php -r "unlink('composer-setup.php');"

设置全局路径(windows请按参考文档设置)

sudo mv composer.phar /usr/bin/composer

配置使用国内镜像

composer config -g repo.packagist composer https://packagist.phpcomposer.com