月度归档:2015年12月

elasticsearch highlighting

PUT /my_index

{
  "mappings": {
    "doc_type": {
      "properties": {
        "content": {
          "type": "string",
          "term_vector": "with_positions_offsets",
          "analyzer": "snowball"
        }
      }
    }
  }
}

POST /_search

{
  "query": {
    "multi_match": {
      "query": "公司",
      "type": "best_fields",
      "fields": [
        "title",
        "content"
      ]
    }
  },
  "filter": {
    "term": {
      "site": "baidu.com"
    }
  },
  "highlight": {
    "fields": {
      "content": {
        "fragment_size": 100,
        "number_of_fragments": 2,
        "no_match_size": 100,
        "term_vector": "with_positions_offsets",
        "boundary_chars": " 。,?",
        "max_boundary_size": 80,
        "force_source": true
      }
    }
  }
}

使用elementtree处理大的xml

这个xml超过了30G,关键是iterparse以及e.clear()

from elementtree.ElementTree import iterparse
from datetime import datetime
import redis
import json
import lxml.html
import re
import traceback
import time
import cgi


def main():
    """Stream questions from the Posts.xml dump into a Redis list.

    The file is parsed incrementally with ``iterparse``. Every ``row``
    element whose ``PostTypeId`` is ``"1"`` is converted into a JSON
    document (url, title, content, tags, site, timestamp) and pushed onto
    the ``ResultQueue`` list. Each element is cleared once it has been
    handled so that the parsed attributes do not accumulate in memory.

    A row that cannot be converted or pushed is reported (traceback plus
    its raw attributes) and skipped; it does not stop the import. Failed
    rows still count toward the progress counter and are still cleared.
    """
    redisConn = redis.from_url("redis://localhost:6379/0")
    xmlfile = "/data/download/Posts.xml"
    # Compiled once: pulls the tag names out of a "<a><b>" style string.
    tag_pattern = re.compile(r"<([^>]+)>")
    i = 0
    for event, e in iterparse(xmlfile):
        if e.tag == "row" and e.get("PostTypeId") == "1":
            try:
                data = {
                    "url": "http://stackoverflow.com/questions/" + e.get("Id"),
                    "title": cgi.escape(e.get("Title")),
                    # Body is HTML; keep only its text before escaping.
                    "content": cgi.escape(lxml.html.fromstring(e.get("Body")).text_content()),
                    "tags": cgi.escape(",".join(tag_pattern.findall(e.get("Tags")))),
                    'site': 'stackoverflow',
                    "timestamp": datetime.now().isoformat()
                }
                redisConn.lpush('ResultQueue', json.dumps(data))
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit can
                # still abort the run. No `continue` here: falling through
                # guarantees the element is cleared below even on failure.
                traceback.print_exc()
                print(e.attrib)
        i += 1
        if i % 1000 == 0:
            # Pause after every 1000 elements, then report progress.
            time.sleep(5)
            print(i)
        # Drop the element's attributes/text/children now that it is done.
        e.clear()


# Start the import only when this file is executed as a script.
if __name__ == '__main__':
    main()

centos elasticsearch 安装

download and install via https://www.elastic.co/downloads/

yum install https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/rpm/elasticsearch/2.1.0/elasticsearch-2.1.0.rpm

make data and logs dir

mkdir -p /data/elastic/data
mkdir -p /data/elastic/logs
chown -R elasticsearch:elasticsearch /data/elastic/

edit config /etc/elasticsearch/elasticsearch.yml

path.data: /data/elastic/data
path.logs: /data/elastic/logs
network.host: 127.0.0.1

edit start script /etc/init.d/elasticsearch

LOG_DIR="/data/elastic/logs"
DATA_DIR="/data/elastic/data"

install java jdk

yum install java-1.8.0-openjdk

start

systemctl enable elasticsearch
/etc/init.d/elasticsearch start

test

/etc/init.d/elasticsearch status
curl http://127.0.0.1:9200/

centos Redis 安装

make data dir for redis

mkdir /data/redis
chown -R redis:redis /data/redis

modify config /etc/redis.conf

daemonize yes
dir /data/redis/
appendonly yes
requirepass mypassword

restart

systemctl enable redis
systemctl start redis