根据文章标题查找相近文章(PHP+结巴分词)

需求

根据当前文章标题,找到相近的10篇文章

测试案例

家里想买一台婴儿理发器,自己给小孩子理发,如何选购婴儿理发器

当前采用any模式进行匹配,匹配结果

家里想买一台彩色激光多功能一体机,什么牌子、什么型号的好?
我家里想买一台家用净水器,一个重庆的朋友给我推荐德国曼稣勒净水器,广东有德国曼稣勒净水器卖吗?
滚筒洗衣机有哪些品牌,家里想买一台滚筒洗衣机好不好,有什么好处
家里想买一台跑步机,有什么需要注意的地方吗
家里想买一台修鞋机,谁告诉我买什么牌子的好呢?
家里想买一台智能电视,长虹Q2F好吗?听说是首款移动互联电视呢。
我是搞婚庆的,想买一台高清摄像机自己用,请大家帮忙参考一下!价格在1W—2W,2W多点也不要紧。
家里想买一台跑步机,不知道什么牌子跑步机比较好
壁挂式新风系统有什么特点?最近家里想买一台新风机,由于家里已
家里想买一台婴儿理发器,自己给小孩子理发,如何选购婴儿理发器

改进后,匹配结果

运宝婴儿理发器怎样啊?好用吗?我想一个运宝理发器给小孩子用。
婴儿理发器哪个牌子好呢?一般大家都是怎么给小孩子理发的呢?怎么让小孩子不乱动呢?
家里想买一台婴儿理发器,自己给小孩子理发,如何选购婴儿理发器
婴儿理发器哪个好宝妈们有给宝贝买理发器吗?哪款更方便好用且静
什么牌子的充电式婴儿理发器好用又便宜 什么牌子的充电式婴儿理
婴儿理发器,家里有的来。 我想知道婴儿理发器,哪个牌子的那款
婴儿理发器想买一个好些的婴儿理发器,要静音,充电,陶瓷头,可
婴儿理发器什么牌子的好 宝宝现在5个多月了,想自己给宝贝理发
给婴儿用的电动理发器哪个牌子好?老公要自己给儿子理发,那就买
蓄电池能接“百特静音电动婴儿理发器”吗?

改进办法

采用结巴分词获取词性

家里/s  想买/v  一台/m  婴儿/n  理发器/n  ,/w  自己/r  给/p  小孩子/n  理发/v  ,/w  如何/r  选购/v  婴儿/n  理发器/n 

获取名词

婴儿 理发器 小孩子 婴儿 理发器

然后进行搜索

相关代码

<?php

use Fukuball\Jieba\Jieba;
use Fukuball\Jieba\Finalseg;
use Fukuball\Jieba\Posseg;
use Fukuball\Jieba\JiebaAnalyse;
use NilPortugues\Sphinx\SphinxClient;

// Initialise every jieba component used below before segmenting.
Jieba::init();
Finalseg::init();
Posseg::init();
JiebaAnalyse::init();

// Sphinx client: extended2 matching, ranked by the `doc_word_count`
// expression, best weight first (ties broken by newest id), top 10 hits.
$sphinxSearch_new = new SphinxClient();
$sphinxSearch_new->setServer($host, 9312);
$sphinxSearch_new->setMatchMode(SPH_MATCH_EXTENDED2);
$sphinxSearch_new->setRankingMode(SPH_RANK_EXPR, 'doc_word_count');
$sphinxSearch_new->setSortMode(SPH_SORT_EXTENDED, '@weight DESC, id desc');
$sphinxSearch_new->setLimits(0, 10);

// Part-of-speech tags treated as "noun-like" and therefore kept for search.
$noun_tags = ['n', 'ng', 'nrt', 'ns', 'nt', 'nz', 'j', 'l', 'vn'];

// Tag the title ($v) and, in a single pass, build:
//   $words       - "word/tag " strings for every token (handy for debugging)
//   $valid_words - the noun-like words only, keyed by their token position
$seg_list = Posseg::cut($v);
$words = [];
$valid_words = [];
foreach ($seg_list as $position => $token) {
    $words[$position] = $token['word'] . "/" . $token["tag"] . " ";
    if (in_array($token["tag"], $noun_tags)) {
        $valid_words[$position] = $token['word'];
    }
}

// Query form: "w1 w2 ..."/2 - a quoted keyword list with a /2 suffix
// (Sphinx extended syntax; presumably the quorum operator, i.e. at least
// two of the keywords must match - confirm against the Sphinx manual).
$valid_words_str = implode(" ", $valid_words);
$quorum_query = '"' . $valid_words_str . '"/2';
$sphinxSearch_new->addQuery($quorum_query, 'wd_question');
$result = $sphinxSearch_new->runQueries();

使用php扩展性能测试

默认的纯PHP版本比较慢,所以改用PHP扩展(扩展启动较慢,幸好执行速度还行:10000次约0.5秒)

<?php
// Micro-benchmark: tag the same sentence 10000 times with the jieba
// extension (mode 2) and print the elapsed wall-clock seconds.
$sample = '小明硕士毕业于中国科学院计算所,后在日本京都大学深造';

$time = microtime(true);
$i = 0;
while ($i < 10000) {
    $result = jieba($sample, 2);
    $i++;
}
$timenew = microtime(true);

$elapsed = $timenew - $time;
echo "共耗时:" . $elapsed . PHP_EOL;

词性分析

https://github.com/fxsjy/jieba/issues/411

# Part-of-speech tag table: each top-level key is a first-level tag and maps
# to a dict of {tag: Chinese description} for that tag and its sub-tags.
# The class counts in the section comments are carried over from the original
# notes; a few of them are larger than the number of tags listed here.
POS = {
    "n": {  # 1. Nouns (1 first-level, 7 second-level, 5 third-level tags)
        "n": "名词",
        "nr": "人名",
        "nr1": "汉语姓氏",
        "nr2": "汉语名字",
        "nrj": "日语人名",
        "nrf": "音译人名",
        "ns": "地名",
        "nsf": "音译地名",
        "nt": "机构团体名",
        "nz": "其它专名",
        "nl": "名词性惯用语",
        "ng": "名词性语素"
    },
    "t": {  # 2. Time words (1 first-level, 1 second-level)
        "t": "时间词",
        "tg": "时间词性语素"
    },
    "s": {  # 3. Place words (1 first-level)
        "s": "处所词"
    },
    "f": {  # 4. Direction/locality words (1 first-level)
        "f": "方位词"
    },
    "v": {  # 5. Verbs (1 first-level, 9 second-level)
        "v": "动词",
        "vd": "副动词",
        "vn": "名动词",
        "vshi": "动词“是”",
        "vyou": "动词“有”",
        "vf": "趋向动词",
        "vx": "形式动词",
        "vi": "不及物动词(内动词)",
        "vl": "动词性惯用语",
        "vg": "动词性语素"
    },
    "a": {  # 6. Adjectives (1 first-level, 4 second-level)
        "a": "形容词",
        "ad": "副形词",
        "an": "名形词",
        "ag": "形容词性语素",
        "al": "形容词性惯用语"
    },
    "b": {  # 7. Distinguishing words (1 first-level, 2 second-level)
        "b": "区别词",
        "bl": "区别词性惯用语"
    },
    "z": {  # 8. State words (1 first-level)
        "z": "状态词"
    },
    "r": {  # 9. Pronouns (1 first-level, 4 second-level, 6 third-level)
        "r": "代词",
        "rr": "人称代词",
        "rz": "指示代词",
        "rzt": "时间指示代词",
        "rzs": "处所指示代词",
        "rzv": "谓词性指示代词",
        "ry": "疑问代词",
        "ryt": "时间疑问代词",
        "rys": "处所疑问代词",
        "ryv": "谓词性疑问代词",
        "rg": "代词性语素"
    },
    "m": {  # 10. Numerals (1 first-level, 1 second-level)
        "m": "数词",
        "mq": "数量词"
    },
    "q": {  # 11. Measure words (1 first-level, 2 second-level)
        "q": "量词",
        "qv": "动量词",
        "qt": "时量词"
    },
    "d": {  # 12. Adverbs (1 first-level)
        "d": "副词"
    },
    "p": {  # 13. Prepositions (1 first-level, 2 second-level)
        "p": "介词",
        "pba": "介词“把”",
        "pbei": "介词“被”"
    },
    "c": {  # 14. Conjunctions (1 first-level, 1 second-level)
        "c": "连词",
        "cc": "并列连词"
    },
    "u": {  # 15. Particles (1 first-level, 15 second-level)
        "u": "助词",
        "uzhe": "着",
        "ule": "了 喽",
        "uguo": "过",
        "ude1": "的 底",
        "ude2": "地",
        "ude3": "得",
        "usuo": "所",
        "udeng": "等 等等 云云",
        "uyy": "一样 一般 似的 般",
        "udh": "的话",
        "uls": "来讲 来说 而言 说来",
        "uzhi": "之",
        "ulian": "连 "  # e.g. “连小学生都会” ("even schoolchildren can do it")
    },
    "e": {  # 16. Interjections (1 first-level)
        "e": "叹词"
    },
    "y": {  # 17. Modal particles (1 first-level)
        "y": "语气词(delete yg)"
    },
    "o": {  # 18. Onomatopoeia (1 first-level)
        "o": "拟声词"
    },
    "h": {  # 19. Prefixes (1 first-level)
        "h": "前缀"
    },
    "k": {  # 20. Suffixes (1 first-level)
        "k": "后缀"
    },
    "x": {  # 21. Strings (1 first-level, 2 second-level)
        "x": "字符串",
        "xx": "非语素字",
        "xu": "网址URL"
    },
    "w": {   # 22. Punctuation (1 first-level, 16 second-level)
        "w": "标点符号",
        "wkz": "左括号",  # ( 〔  [  {  《 【  〖 〈   half-width: ( [ { <
        "wky": "右括号",  # ) 〕  ] } 》  】 〗 〉 half-width: ) ] } >
        "wyz": "全角左引号",  # “ ‘ 『
        "wyy": "全角右引号",  # ” ’ 』
        "wj": "全角句号",  # 。
        "ww": "问号",  # full-width: ? half-width: ?
        "wt": "叹号",  # full-width: ! half-width: !
        "wd": "逗号",  # full-width: , half-width: ,
        "wf": "分号",  # full-width: ; half-width: ;
        "wn": "顿号",  # full-width: 、
        "wm": "冒号",  # full-width: : half-width: :
        "ws": "省略号",  # full-width: ……  …
        "wp": "破折号",  # full-width: ——   --   ——-   half-width: ---  ----
        "wb": "百分号千分号",  # full-width: % ‰   half-width: %
        "wh": "单位符号"  # full-width: ¥ $ £  °  ℃  half-width: $
    }
}

取重点词

// Extract the key words of the title $t into $valid_words / $valid_words_str.
//
// jieba($t, 2) is read here as a map of word => part-of-speech tag
// (the loop below takes the key as the word and the value as the tag).
$words = [];
$seg_list = jieba($t, 2);
foreach ($seg_list as $word => $tag) {
    // PHP turns numeric-string array keys ("0", "2020", ...) into integers.
    // Cast back to string so the strict comparisons below behave, and so an
    // integer 0 is never loosely equal to a punctuation string (PHP < 8).
    $words[] = ['t' => $tag, 'w' => (string) $word];
}

$valid_words = [];
// Punctuation / whitespace tokens, in both half-width and full-width forms.
$stop_words = [
    ',', ',', '.', '。', '!', '!', '?', '?', ' ',
    "\xE3\x80\x80", // U+3000 ideographic (full-width) space
];
// Tags that are kept unconditionally: nouns, proper names, abbreviations.
$name_words = ['n', 'ng', 'nrt', 'nr', 'ns', 'nt', 'nz', 'j', 'vn'];

foreach ($words as $pos => $token) {
    $next = isset($words[$pos + 1]) ? $words[$pos + 1] : null;
    $after_next = isset($words[$pos + 2]) ? $words[$pos + 2] : null;

    // Verb: keep it only when it closes a clause, i.e. it is the last token
    // or is directly followed by a punctuation token.
    if ($token['t'] === 'v') {
        $closes_clause = $next === null
            || ($next['t'] === 'x' && in_array($next['w'], $stop_words, true));
        if ($closes_clause) {
            $valid_words[] = $token['w'];
            continue;
        }
    }

    // Unknown word (tag "x") that is not punctuation.
    if ($token['t'] === 'x' && !in_array($token['w'], $stop_words, true)) {
        // Directly followed by a noun, person name, verb or "uj" particle.
        if ($next !== null && in_array($next['t'], ['n', 'nr', 'v', 'uj'], true)) {
            $valid_words[] = $token['w'];
            continue;
        }
        // Followed by a preposition/conjunction and then a noun-like word.
        if ($next !== null
            && ($next['t'] === 'p' || $next['t'] === 'c')
            && $after_next !== null
            && in_array($after_next['t'], $name_words, true)
        ) {
            $valid_words[] = $token['w'];
            continue;
        }
    }

    // Nouns and abbreviations are always kept.
    if (in_array($token['t'], $name_words, true)) {
        $valid_words[] = $token['w'];
    }
}
$valid_words_str = implode(" ", $valid_words);

使用php转换URL从相对路径到绝对路径

Transform a relative path into an absolute URL using PHP

/**
 * Resolve a (possibly relative) URL reference against an absolute base URL.
 *
 * Behaviour, following RFC 3986 section 5.2:
 *  - an empty reference resolves to the base itself;
 *  - a reference that already has a scheme is returned unchanged;
 *  - "#frag" replaces the base's fragment, "?query" replaces its query
 *    and fragment;
 *  - "//host/path" inherits only the base's scheme;
 *  - a path reference is merged with the base path and "." / ".." / empty
 *    segments are removed. The base's own query string is NOT carried over.
 *
 * @param string $rel  URL reference to resolve.
 * @param string $base Absolute URL the reference is relative to.
 *
 * @return string The absolute URL, or $rel unchanged when $base cannot be
 *                parsed into at least a scheme and a host.
 */
function rel2abs($rel, $base)
{
    $rel = (string) $rel;

    /* an empty reference designates the base document itself */
    if ($rel === '') {
        return $base;
    }

    /* return if already absolute URL */
    if ((string) parse_url($rel, PHP_URL_SCHEME) !== '') {
        return $rel;
    }

    /* anchor: keep the base up to (not including) its own fragment */
    if ($rel[0] === '#') {
        return substr($base, 0, strcspn($base, '#')) . $rel;
    }

    /* query: keep the base up to its own query/fragment */
    if ($rel[0] === '?') {
        return substr($base, 0, strcspn($base, '?#')) . $rel;
    }

    /* explicit lookups instead of extract(): no undefined or stray variables */
    $parts = parse_url($base);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return $rel;
    }

    /* protocol-relative reference: only the scheme comes from the base */
    if (strncmp($rel, '//', 2) === 0) {
        return $parts['scheme'] . ':' . $rel;
    }

    /* user[:pass]@host[:port] */
    $authority = '';
    if (isset($parts['user'])) {
        $authority .= $parts['user'];
        if (isset($parts['pass'])) {
            $authority .= ':' . $parts['pass'];
        }
        $authority .= '@';
    }
    $authority .= $parts['host'];
    if (isset($parts['port'])) {
        $authority .= ':' . $parts['port'];
    }

    /* split the reference so its query/fragment is never touched by the
       path normalisation below */
    $cut = strcspn($rel, '?#');
    $relPath = substr($rel, 0, $cut);
    $suffix = (string) substr($rel, $cut);

    /* directory of the base path; dropped when the reference is root-relative */
    $basePath = isset($parts['path']) ? $parts['path'] : '';
    $dir = $relPath[0] === '/'
        ? ''
        : (string) preg_replace('#/[^/]*$#', '', $basePath);

    /* remove '', '.' and '..' segments ('..' never climbs above the root) */
    $pieces = explode('/', $dir . '/' . $relPath);
    $segments = [];
    foreach ($pieces as $piece) {
        if ($piece === '' || $piece === '.') {
            continue;
        }
        if ($piece === '..') {
            array_pop($segments);
            continue;
        }
        $segments[] = $piece;
    }

    $path = '/' . implode('/', $segments);

    /* a reference ending in '/', '/.' or '/..' names a directory */
    $lastPiece = end($pieces);
    if ($segments !== [] && in_array($lastPiece, ['', '.', '..'], true)) {
        $path .= '/';
    }

    /* absolute URL is ready! */
    return $parts['scheme'] . '://' . $authority . $path . $suffix;
}

sphinx-jieba试用笔记

实际使用中发现parse耗时很多,而且经常搜索不到词。弃坑。

下载并安装sphinx-jieba

$ git clone https://github.com/c4ys/sphinx-jieba
$ cd sphinx-jieba
$ git submodule update --init --recursive
$ sudo apt install gcc cmake automake g++
$ sudo apt install libmysqld-dev
$ ./configure --prefix=/usr/local/sphinx-jieba
$ cp cppjieba/include/cppjieba src/ -r
$ cp cppjieba/deps/limonp src/ -r
$ sudo make install

建立数据库

sql如下

CREATE TABLE documents ( id INTEGER PRIMARY KEY NOT NULL AUTO_INCREMENT, title VARCHAR(255) NOT NULL );

REPLACE INTO documents ( title ) VALUES
('广州狗场直销泰迪边牧阿拉斯加等各名犬 微信视频同步'),
('出售阿拉金毛拉多泰迪萨摩哈士奇等30多个品种 保健康可送货'),
('广州哪里买纯种哈士奇 雪橇犬哈士奇多少钱'),
('广州边境牧羊犬狗场 广州哪里有卖边牧犬小狗 边境牧羊犬小狗'),
('广州跳跳犬舍 纯种憨厚老实巴哥幼犬 小型短毛犬 纯种健康'),
('广州地区金毛多少钱一只巡回犬赛级品质 签协议 健康血统有保'),
('广州狗场直销阿拉斯加金毛泰迪哈士奇萨摩耶秋田德牧等各种名犬');

建立sphinx配置

# Data source: reads the rows of the `documents` table from the local MySQL
# database `test`.
source src1
{
        type                    = mysql
        # Run before the main query so the connection uses UTF-8.
        sql_query_pre = SET NAMES utf8
        sql_host                = localhost
        sql_user                = test
        # Empty password.
        sql_pass                = 
        sql_db                  = test
        sql_port                = 3306  # optional, default is 3306
        # Main fetch query: document id plus the title text.
        sql_query               =   SELECT id,  title   FROM documents
        # NOTE(review): presumably makes `title` both a full-text field and a
        # stored string attribute - confirm against the Sphinx manual.
        sql_field_string = title
}


# Index built from source src1, stored under the sphinx-jieba install prefix.
index test1
{
        source                  = src1
        path                    = /usr/local/sphinx-jieba/var/data/test1
        charset_type = utf-8
        # Path PREFIX of the jieba dictionaries. The dictionary files live in
        # /usr/local/sphinx-jieba/etc/ as xdictjieba.dict.utf8,
        # xdictuser.dict.utf8, ... so the prefix must point into the
        # sphinx-jieba install directory, not /usr/local/sphinx.
        chinese_dictionary = /usr/local/sphinx-jieba/etc/xdict
}



# Settings for the `indexer` tool.
indexer
{
        # Memory limit for an indexing run.
        mem_limit               = 128M
}


# Settings for the `searchd` daemon; logs, pid file and binlog all live under
# the /usr/local/sphinx-jieba install prefix.
searchd
{
        # Port the PHP SphinxClient example connects to.
        listen                  = 9312
        # MySQL-protocol listener, used by `mysql -h 127.0.0.1 -P 9306`.
        listen                  = 9306:mysql41
        log                     = /usr/local/sphinx-jieba/var/log/searchd.log
        query_log               = /usr/local/sphinx-jieba/var/log/query.log
        read_timeout            = 5
        max_children            = 30
        pid_file                = /usr/local/sphinx-jieba/var/log/searchd.pid
        seamless_rotate         = 1
        preopen_indexes         = 1
        unlink_old              = 1
        workers                 = threads # for RT to work
        binlog_path             = /usr/local/sphinx-jieba/var/data
}

复制词典

sudo cp cppjieba/dict/* /usr/local/sphinx-jieba/etc/ -r
cd /usr/local/sphinx-jieba/
sudo cp etc/jieba.dict.utf8 etc/xdictjieba.dict.utf8
sudo cp etc/user.dict.utf8 etc/xdictuser.dict.utf8
sudo cp etc/hmm_model.utf8 etc/xdicthmm_model.utf8
sudo cp etc/idf.utf8 etc/xdictidf.utf8
sudo cp etc/stop_words.utf8 etc/xdictstop_words.utf8

建立索引,并启动sphinx服务

sudo bin/indexer --all
sudo bin/searchd

测试

mysql -h 127.0.0.1 -P 9306

通过MySQL协议连接到searchd(9306端口)后,执行查询

select * from test1 where match('宠物狗') limit 1000;

Percona安装Tokudb引擎

https://www.percona.com/doc/percona-server/LATEST/tokudb/tokudb_installation.html

添加percona源

参考:https://mirrors.gzqdn.org/help/percona/

安装libjemalloc

sudo apt install libjemalloc-dev

添加libjemalloc到配置

在/etc/mysql/percona-server.conf.d/mysqld_safe.cnf文件中添加

[mysqld_safe]
# malloc-lib must be the absolute path of the jemalloc shared library, not
# the header directory /usr/include/jemalloc. On Debian/Ubuntu amd64 the
# libjemalloc-dev package provides this symlink; adjust the directory for
# other architectures or distributions.
malloc-lib = /usr/lib/x86_64-linux-gnu/libjemalloc.so

关闭Transparent huge pages

查看Transparent huge pages状态

cat /sys/kernel/mm/transparent_hugepage/enabled

关闭Transparent huge pages需要以root身份运行

echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo never > /sys/kernel/mm/transparent_hugepage/defrag

安装包

apt-get install percona-server-tokudb-5.7

激活Tokudb引擎

sudo ps_tokudb_admin --enable -uroot -pPassw0rd

查看状态

mysql> SHOW ENGINES;
mysql> SELECT @@tokudb_version;

mariadb

https://mariadb.com/kb/en/mariadb/enabling-tokudb/

mysql通过存储过程以及事务批量生成数据

存储过程加上事务能够提高插入效率:

CREATE DEFINER=`root`@`%` PROCEDURE `autoinsert`(IN NUM INT)
BEGIN
    -- Insert NUM rows into `user`, each with a random 10-character name and
    -- a random 2-digit city id, all inside one transaction.
    DECLARE inserted INT DEFAULT 0;

    START TRANSACTION;
    WHILE inserted < NUM DO
        INSERT INTO `user` (`name`, `city_id`)
        VALUES (rand_str(10), rand_int(2));
        SET inserted = inserted + 1;
    END WHILE;
    COMMIT;
END

-- Table filled by the BatchInsertTest procedure: one row per generated post,
-- with a secondary B-tree index on forum_id.
CREATE TABLE `test` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  -- BatchInsertTest writes a random value in 1..1000 here.
  `forum_id` bigint(20) unsigned NOT NULL,
  -- BatchInsertTest writes a random moment within the last 30 days here.
  `created` datetime NOT NULL,
  PRIMARY KEY (`id`),
  KEY `test_forum_id_IDX` (`forum_id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=750136 DEFAULT CHARSET=utf8mb4

DROP PROCEDURE IF EXISTS test.BatchInsertTest;
delimiter //
CREATE PROCEDURE BatchInsertTest(IN loop_time INT)
BEGIN
    -- Insert loop_time rows into `test` inside a single transaction. Each
    -- row gets a random forum_id in 1..1000 and a random `created` time
    -- within the last 30 days.
    DECLARE done_rows INT DEFAULT 0;

    START TRANSACTION;
    WHILE done_rows < loop_time DO
        INSERT INTO `test` (`forum_id`, `created`)
        VALUES (
            FLOOR(1 + (RAND() * 1000)),
            FROM_UNIXTIME(UNIX_TIMESTAMP() - (RAND() * 30 * 86400))
        );
        SET done_rows = done_rows + 1;
    END WHILE;
    COMMIT;
END;
  //
delimiter ;

CALL BatchInsertTest(10000);

select count(*) from test;

mysql生成固定位数随机字母以及数字

mysql生成固定位数随机字母以及数字

随机数字

CREATE DEFINER=`root`@`%` FUNCTION `rand_int`(counts INTEGER) RETURNS varchar(20) CHARSET utf8
BEGIN
    -- Returns a string of exactly `counts` random decimal digits.
    --
    -- FLOOR(RAND() * 10^counts) always lies in [0, 10^counts - 1], so the
    -- value never needs more than `counts` digits. The earlier
    -- ROUND(ROUND(RAND(), counts) * 10^counts) form could round up to
    -- 10^counts and return counts + 1 digits.
    -- LPAD restores the leading zeros lost in the numeric conversion, which
    -- keeps the width fixed without skewing the value (the earlier version
    -- appended zeros on the right instead).
    -- RAND() is a DOUBLE, so beyond roughly 15 digits the low-order digits
    -- are no longer meaningfully random; the return type caps the result at
    -- 20 characters.
    RETURN LPAD(CAST(FLOOR(RAND() * POW(10, counts)) AS UNSIGNED), counts, '0');
END

随机字母:

CREATE DEFINER=`root`@`%` FUNCTION `rand_str`(
    f_num INT UNSIGNED
    ) RETURNS varchar(200) CHARSET latin1
BEGIN
    -- Returns a string of f_num characters drawn uniformly from [a-zA-Z0-9].
    -- The return type caps the result at 200 characters.
    DECLARE i INT UNSIGNED DEFAULT 0;
    DECLARE v_result VARCHAR(200) DEFAULT '';
    DECLARE v_dict VARCHAR(62) DEFAULT 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
    DECLARE v_dict_len INT UNSIGNED DEFAULT 0;

    SET v_dict_len = CHAR_LENGTH(v_dict);

    WHILE i < f_num DO
        -- 1 + FLOOR(RAND() * len) is always in 1..len, so every character is
        -- equally likely and SUBSTR never receives position 0. The earlier
        -- version padded the dictionary to 200 characters (over-representing
        -- 'a'..'n') and used CEIL(RAND() * 200), which yields position 0 and
        -- an empty string when RAND() returns 0.
        SET v_result = CONCAT(v_result, SUBSTR(v_dict, 1 + FLOOR(RAND() * v_dict_len), 1));
        SET i = i + 1;
    END WHILE;

    RETURN v_result;
END