实际使用中发现 parse 耗时很长，而且经常搜索不到词，已弃坑。
下载并安装sphinx-jieba
# Fetch the sphinx-jieba sources and initialise its git submodules.
$ git clone https://github.com/c4ys/sphinx-jieba
$ cd sphinx-jieba
$ git submodule update --init --recursive
# Install the compiler toolchain and the MySQL development package.
$ sudo apt install gcc cmake automake g++
$ sudo apt install libmysqld-dev
# Configure with a dedicated install prefix; every later path in this guide
# lives under /usr/local/sphinx-jieba.
$ ./configure --prefix=/usr/local/sphinx-jieba
# Copy the cppjieba and limonp directories into src/ before building
# (presumably so the Sphinx sources can include them -- confirm against the project README).
$ cp -r cppjieba/include/cppjieba src/
$ cp -r cppjieba/deps/limonp src/
$ sudo make install
建立数据库
sql如下
-- Demo table holding the titles that Sphinx will index.
-- The character set is declared explicitly so the Chinese titles below are
-- stored correctly even when the server's default charset is not UTF-8.
CREATE TABLE documents (
    id    INTEGER      NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255) NOT NULL
) DEFAULT CHARSET = utf8mb4;

-- Sample rows. id is omitted so AUTO_INCREMENT assigns it; since no key value
-- is supplied there is never a row to replace, so a plain INSERT is used.
INSERT INTO documents (title) VALUES
    ('广州狗场直销泰迪边牧阿拉斯加等各名犬 微信视频同步'),
    ('出售阿拉金毛拉多泰迪萨摩哈士奇等30多个品种 保健康可送货'),
    ('广州哪里买纯种哈士奇 雪橇犬哈士奇多少钱'),
    ('广州边境牧羊犬狗场 广州哪里有卖边牧犬小狗 边境牧羊犬小狗'),
    ('广州跳跳犬舍 纯种憨厚老实巴哥幼犬 小型短毛犬 纯种健康'),
    ('广州地区金毛多少钱一只巡回犬赛级品质 签协议 健康血统有保'),
    ('广州狗场直销阿拉斯加金毛泰迪哈士奇萨摩耶秋田德牧等各种名犬');
建立sphinx配置（保存为 /usr/local/sphinx-jieba/etc/sphinx.conf，即 indexer 和 searchd 在未指定 --config 时默认读取的配置文件）
# Data source: reads the rows to index from the MySQL table `documents`.
source src1
{
    type             = mysql

    # Connection settings for the MySQL server.
    sql_host         = localhost
    sql_port         = 3306 # optional, default is 3306
    sql_user         = test
    sql_pass         =
    sql_db           = test

    # Executed before the main query so the rows are fetched as UTF-8.
    sql_query_pre    = SET NAMES utf8

    # Main fetch query; the first column (id) is the document id.
    sql_query        = SELECT id, title FROM documents

    # Declares title as both a full-text field and a stored string attribute.
    sql_field_string = title
}
# Index built from the src1 data source.
index test1
{
    source             = src1
    path               = /usr/local/sphinx-jieba/var/data/test1
    charset_type       = utf-8

    # Must live under the install prefix passed to ./configure
    # (/usr/local/sphinx-jieba), which is where this guide copies the
    # xdict*-prefixed dictionary files; /usr/local/sphinx is never populated.
    # NOTE(review): the value appears to be a filename prefix
    # ("xdict" + "jieba.dict.utf8", ...) rather than a directory -- confirm
    # against the sphinx-jieba sources.
    chinese_dictionary = /usr/local/sphinx-jieba/etc/xdict
}
# Settings for the indexer tool.
indexer
{
    # Memory limit for an indexing run.
    mem_limit = 128M
}
# Settings for the searchd daemon.
searchd
{
    # Two listeners: 9312 with the default protocol, and 9306 speaking the
    # MySQL 4.1 wire protocol so that a regular mysql client can connect.
    listen          = 9312
    listen          = 9306:mysql41

    # Log, pid and binlog locations, all under the install prefix.
    log             = /usr/local/sphinx-jieba/var/log/searchd.log
    query_log       = /usr/local/sphinx-jieba/var/log/query.log
    pid_file        = /usr/local/sphinx-jieba/var/log/searchd.pid
    binlog_path     = /usr/local/sphinx-jieba/var/data

    # Network and concurrency limits.
    read_timeout    = 5
    max_children    = 30
    workers         = threads # for RT to work

    # Index rotation and file-handling behaviour.
    seamless_rotate = 1
    preopen_indexes = 1
    unlink_old      = 1
}
复制词典
# Copy the cppjieba dictionaries into the install prefix.
# Assumes the current directory is still the sphinx-jieba checkout.
sudo cp -r cppjieba/dict/* /usr/local/sphinx-jieba/etc/
cd /usr/local/sphinx-jieba/
# Duplicate each dictionary under an "xdict"-prefixed file name.
for dict in jieba.dict.utf8 user.dict.utf8 hmm_model.utf8 idf.utf8 stop_words.utf8; do
    sudo cp "etc/$dict" "etc/xdict$dict"
done
建立索引,并启动sphinx服务
# Build all configured indexes, then start the search daemon.
# Assumes the current directory is /usr/local/sphinx-jieba/.
sudo ./bin/indexer --all
sudo ./bin/searchd
测试
# Connect to port 9306 on the local machine with the stock mysql client.
mysql --host=127.0.0.1 --port=9306
使用 mysql 客户端连接 searchd 后，执行以下查询：
SELECT *
FROM test1
WHERE MATCH('宠物狗')
LIMIT 1000;