安装 hbase
下载
不能下载太新的版本(Nutch 2.3 依赖的 Gora 0.5 的 gora-hbase 模块面向 HBase 0.94.x,新版 HBase 可能不兼容),下载地址在: http://mirror.bit.edu.cn/apache/hbase/
cd /data/server
wget http://mirror.bit.edu.cn/apache/hbase/hbase-0.94.27/hbase-0.94.27.tar.gz
tar zxvf hbase-0.94.27.tar.gz
cd hbase-0.94.27
修改配置
修改conf/hbase-site.xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>file:///data/data/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>false</value>
</property>
</configuration>
启动
bin/start-hbase.sh
运行
bin/hbase shell
安装nutch
下载
下载地址: http://mirror.bit.edu.cn/apache/nutch/
cd /data/server
wget http://mirror.bit.edu.cn/apache/nutch/2.3/apache-nutch-2.3-src.tar.gz
tar -zxvf apache-nutch-2.3-src.tar.gz
cd apache-nutch-2.3
配置
ivy/ivy.xml(取消下面这行 gora-hbase 依赖的注释)
<dependency org="org.apache.gora" name="gora-hbase" rev="0.5" conf="*->default" />
conf/gora.properties
gora.datastore.default=org.apache.gora.hbase.store.HBaseStore
build
ant clean
ant runtime
config
runtime/local/conf/nutch-site.xml
<configuration>
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.hbase.store.HBaseStore</value>
</property>
<property>
<name>plugin.includes</name>
<!-- do **NOT** enable the parse-html plugin, if you want proper HTML parsing. Use something like parse-tika! -->
<value>protocol-httpclient|urlfilter-regex|parse-(text|tika|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)|indexer-elastic</value>
</property>
<property>
<name>db.ignore.external.links</name>
<value>true</value>
<!-- do not leave the seeded domains (optional) -->
</property>
<!-- elasticsearch index properties -->
<property>
<name>elastic.host</name>
<value>localhost</value>
<description>The hostname to send documents to using TransportClient. Either host and port must be defined or cluster.
</description>
</property>
<property>
<name>elastic.cluster</name>
<value>elasticsearch</value>
<description>The cluster name to discover. Either host and port must be defined or cluster.
</description>
</property>
<property>
<name>parser.character.encoding.default</name>
<value>utf-8</value>
</property>
<property>
<name>http.agent.name</name>
<value>Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36</value>
</property>
<property>
<name>http.agent.description</name>
<value>Programmer's search</value>
</property>
<property>
<name>http.robots.403.allow</name>
<value>true</value>
</property>
<property>
<name>http.agent.url</name>
<value>http://hisearch.cn</value>
</property>
<property>
<name>http.verbose</name>
<value>true</value>
</property>
<property>
<name>http.accept.language</name>
<value>zh,zh-CN;q=0.8,en;q=0.6</value>
</property>
<property>
<name>http.agent.version</name>
<value>0.1</value>
</property>
</configuration>
runtime/local/conf/hbase-site.xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>file:///data/data/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>false</value>
</property>
</configuration>
run
cd runtime/local
mkdir seed
echo "http://www.cnblogs.com" > seed/urls.txt
bin/nutch inject seed/
bin/nutch generate -topN 10
bin/nutch fetch -all
bin/nutch parse -all
bin/nutch updatedb
bin/crawl seed/ testCraw 3
bin/nutch elasticindex elasticsearch -all