标签归档:sphinxsearch

sphinx-jieba试用笔记

实际使用中发现parse耗时很多,而且经常搜索不到词。弃坑。

下载并安装sphinx-jieba

# Fetch the sources and pull in the git submodules.
$ git clone https://github.com/c4ys/sphinx-jieba
$ cd sphinx-jieba
$ git submodule update --init --recursive
# Build toolchain, then the MySQL development package.
$ sudo apt install gcc cmake automake g++
$ sudo apt install libmysqld-dev
$ ./configure --prefix=/usr/local/sphinx-jieba
# Place the cppjieba and limonp directories inside src/ before building.
$ cp -r cppjieba/include/cppjieba src/
$ cp -r cppjieba/deps/limonp src/
$ sudo make install

建立数据库

sql如下

-- Sample table read by the Sphinx source `src1`.
-- The character set is declared explicitly so the Chinese titles are stored
-- intact even when the server default is not a UTF-8 charset.
CREATE TABLE documents (
    id    INTEGER      NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255) NOT NULL
) DEFAULT CHARSET = utf8mb4;

-- Plain INSERT: no key value is supplied, so REPLACE had nothing to replace.
INSERT INTO documents (title) VALUES
    ('广州狗场直销泰迪边牧阿拉斯加等各名犬 微信视频同步'),
    ('出售阿拉金毛拉多泰迪萨摩哈士奇等30多个品种 保健康可送货'),
    ('广州哪里买纯种哈士奇 雪橇犬哈士奇多少钱'),
    ('广州边境牧羊犬狗场 广州哪里有卖边牧犬小狗 边境牧羊犬小狗'),
    ('广州跳跳犬舍 纯种憨厚老实巴哥幼犬 小型短毛犬 纯种健康'),
    ('广州地区金毛多少钱一只巡回犬赛级品质 签协议 健康血统有保'),
    ('广州狗场直销阿拉斯加金毛泰迪哈士奇萨摩耶秋田德牧等各种名犬');

建立sphinx配置(保存为 /usr/local/sphinx-jieba/etc/sphinx.conf,即 indexer 和 searchd 默认读取的配置文件)

# MySQL data source that feeds the test1 index.
source src1
{
        type             = mysql

        # connection settings
        sql_host         = localhost
        sql_port         = 3306  # optional, default is 3306
        sql_user         = test
        sql_pass         =
        sql_db           = test

        # run before the main query so rows are fetched as UTF-8
        sql_query_pre    = SET NAMES utf8
        # main fetch query: document id first, then the title text
        sql_query        = SELECT id,  title   FROM documents
        # title is declared as a string field
        sql_field_string = title
}


# Index built from source src1, stored under the sphinx-jieba install prefix.
index test1
{
        source             = src1
        path               = /usr/local/sphinx-jieba/var/data/test1
        charset_type       = utf-8
        # Dictionary path prefix. It must point into the sphinx-jieba install
        # prefix (/usr/local/sphinx-jieba), where the xdict*-prefixed dictionary
        # files are placed, not into /usr/local/sphinx.
        chinese_dictionary = /usr/local/sphinx-jieba/etc/xdict
}



# Settings for the indexer tool.
indexer
{
        # memory limit used while indexing
        mem_limit = 128M
}


# Settings for the search daemon.
searchd
{
        # listeners: plain port 9312, plus port 9306 speaking the mysql41 protocol
        listen          = 9312
        listen          = 9306:mysql41

        # log and pid files
        log             = /usr/local/sphinx-jieba/var/log/searchd.log
        query_log       = /usr/local/sphinx-jieba/var/log/query.log
        pid_file        = /usr/local/sphinx-jieba/var/log/searchd.pid

        # limits
        read_timeout    = 5
        max_children    = 30

        # index file handling
        seamless_rotate = 1
        preopen_indexes = 1
        unlink_old      = 1

        workers         = threads # for RT to work
        binlog_path     = /usr/local/sphinx-jieba/var/data
}

复制词典

# Copy the cppjieba dictionaries into the install prefix.
sudo cp -r cppjieba/dict/* /usr/local/sphinx-jieba/etc/
cd /usr/local/sphinx-jieba/
# Duplicate each dictionary under an "xdict"-prefixed file name.
for dict in jieba.dict.utf8 user.dict.utf8 hmm_model.utf8 idf.utf8 stop_words.utf8; do
    sudo cp "etc/$dict" "etc/xdict$dict"
done

建立索引,并启动sphinx服务

# Build the indexes, then start the search daemon.
# The bin/ paths are relative: this assumes the current directory is still
# the install prefix /usr/local/sphinx-jieba.
sudo bin/indexer --all
sudo bin/searchd

测试

# Connect with the mysql client to searchd's mysql41 listener on port 9306.
mysql -h 127.0.0.1 -P 9306

连接成功后,在mysql客户端中执行查询:

SELECT *
FROM test1
WHERE MATCH('宠物狗')
LIMIT 1000;

sphinx-for-chinese快速配置

建立计数表

-- Counter table: one row per counter_id, each holding a max_doc_id value.
CREATE TABLE sph_counter (
    counter_id INTEGER NOT NULL PRIMARY KEY,
    max_doc_id BIGINT  NOT NULL
);

下载并安装sphinx-for-chinese

下载地址:http://www.sphinx-search.com/

# Download and unpack the sphinx-for-chinese sources.
wget http://www.sphinx-search.com/downloads/sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
tar -xvf sphinx-for-chinese-2.2.1-dev-r4311.tar.gz
cd sphinx-for-chinese-2.2.1-dev-r4311
./configure --prefix=/usr/local/sphinx-for-chinese --with-mysql --enable-id64
# Build as the current user; only the install step into /usr/local needs root.
make -j4
sudo make install

下载词典:

# Download the plain-text dictionary and unpack it.
wget https://sphinx-for-chinese.googlecode.com/files/xdict_1.1.tar.gz
tar -xzf xdict_1.1.tar.gz
# Compile the dictionary with mkdict; the output goes into the install prefix
# under /usr/local, so the command is run through sudo.
sudo /usr/local/sphinx-for-chinese/bin/mkdict xdict_1.1.txt /usr/local/sphinx-for-chinese/etc/xdict

在索引配置项中添加以下两项:

    charset_type = utf-8
    chinese_dictionary = /usr/local/sphinx-for-chinese/etc/xdict

全局配置

# Settings for the indexer tool.
indexer
{
    # memory limit used while indexing
    mem_limit = 512M
}
# Settings for the search daemon.
searchd
{
    # listeners: plain port 9312, plus port 9306 speaking the mysql41 protocol
    listen          = 9312
    listen          = 9306:mysql41

    # log and pid files
    log             = /usr/local/sphinx-for-chinese/var/log/searchd.log
    query_log       = /usr/local/sphinx-for-chinese/var/log/query.log
    pid_file        = /usr/local/sphinx-for-chinese/var/log/searchd.pid

    # limits
    read_timeout    = 5
    max_children    = 30
    max_matches     = 1000

    # index file handling
    seamless_rotate = 1
    preopen_indexes = 1
    unlink_old      = 1

    workers         = threads # for RT to work

    # Disable the binary log. Merely commenting binlog_path out keeps the
    # built-in default path active; an explicitly empty value is what turns
    # binary logging off.
    binlog_path     =
}

测试代码

<?php

// Smoke test: send a list of keywords to searchd and dump the matches
// returned for each one.

$s = new SphinxClient;
$s->setServer("192.168.2.101", 9312);
$s->setMatchMode(SPH_MATCH_ALL);
// setMaxQueryTime() takes milliseconds; 3000 gives each query a 3 second
// budget (a value of 3 would stop the search after 3 ms).
$s->setMaxQueryTime(3000);

$keywords = array(
    'the',
    '一分钱补差价',
    '一分钱 补差价',
    '补差价 一分钱',
    '一分钱乱补差价',
    '一分钱 乱 补差价',
    '一分钱',
    '补差价',
);

foreach ($keywords as $k) {
    $result = $s->query($k);
    echo $k, "\n";

    // query() returns false on failure; report the reason instead of
    // silently printing nothing.
    if ($result === false) {
        echo 'query failed: ', $s->getLastError(), "\n";
        continue;
    }

    if (isset($result['matches'])) {
        print_r($result['matches']);
    }
}