sudo apt install default-jdk
java -version
openjdk version "11.0.22" 2024-01-16
vi .bashrc
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
爬梯子下载源代码 Apache Nutch™ -- Downloads
mkdir -p urls
cd urls
touch seed.txt
里面放入我的网站地址
bin/nutch inject crawl/crawldb urls
显示
Injecting seed URL file file:/data/apache-nutch-1.19/urls/seed.txt
Total new urls injected: 1
s1=`ls -d crawl/segments/2* | tail -1`
echo $s1
bin/nutch generate crawl/crawldb crawl/segments
apache nutch No agents listed in 'http.agent.name' property.
conf/nutch-site.xml
<property>
<name>http.agent.name</name>
<value>MyNutchBot/1.0</value>
</property>
export APACHE_SOLR_HOME=/data/solr-8.11.3
export NUTCH_RUNTIME_HOME=/data/apache-nutch-1.19
${APACHE_SOLR_HOME}/bin/solr start -force
open file limit is currently 1024
vi /etc/security/limits.conf
* soft nofile 4096
* hard nofile 4096
Started Solr server on port 8983 (pid=29369). Happy searching!
${APACHE_SOLR_HOME}/bin/solr start -force
${APACHE_SOLR_HOME}/bin/solr create -c nutch -d ${APACHE_SOLR_HOME}/server/solr/configsets/nutch/conf/ -force
ls crawl/segments/
bin/nutch index crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segments/20240326063028/ -filter -normalize -deleteGone
https://dlcdn.apache.org/lucene/solr/8.11.3/solr-8.11.3.tgz
https://nutch.apache.org/download/
https://dlcdn.apache.org/nutch/1.19/apache-nutch-1.19-bin.tar.gz
https://cwiki.apache.org/confluence/display/NUTCH/NutchTutorial
去掉robot的处理