package step1;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class Task {
    /** Base URI passed to jsoup when parsing the saved www.ctrip.com page. */
    private static final String HOME_PAGE_BASE_URI = "/backups/www.ctrip.com.txt/";

    /** Base URI passed to jsoup when parsing the saved domestic city hotel page. */
    private static final String CITY_HOTEL_BASE_URI = "/backups/hotels.ctrip.com_domestic-city-hotel.txt/";

    /**
     * Parses the HTML file at the given path into a jsoup document, decoding it as UTF-8.
     *
     * @param filePath path of the file to parse, e.g. backups/www.ctrip.com.txt/
     * @return the parsed document
     * @throws IOException declared by {@code Jsoup.parse} for file loading problems
     */
    public Document getHtml1(String filePath) throws IOException {
        /********** Begin **********/
        return Jsoup.parse(new File(filePath), "UTF-8", HOME_PAGE_BASE_URI);
        /********** End **********/
    }

    /**
     * Parses the HTML file at the given path into a jsoup document, decoding it as UTF-8.
     *
     * @param filePath path of the file to parse,
     *                 e.g. backups/hotels.ctrip.com_domestic-city-hotel.txt/
     * @return the parsed document
     * @throws IOException declared by {@code Jsoup.parse} for file loading problems
     */
    public Document getHtml2(String filePath) throws IOException {
        /********** Begin **********/
        return Jsoup.parse(new File(filePath), "UTF-8", CITY_HOTEL_BASE_URI);
        /********** End **********/
    }
}
第2关 解析并提取HTML 元素(一)
package step2;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    /**
     * Builds a Document from the file at {@code filePath}, decoded as UTF-8.
     *
     * @param filePath path of the HTML file to parse
     * @return the parsed document
     * @throws IOException declared by {@code Jsoup.parse} for file loading problems
     */
    public Document getDoc1(String filePath) throws IOException {
        /********** Begin **********/
        final File source = new File(filePath);
        return Jsoup.parse(source, "UTF-8", "/backups/www.ctrip.com.txt");
        /********** End **********/
    }

    /**
     * Builds a Document from the file at {@code filePath}, decoded as UTF-8.
     *
     * @param filePath path of the HTML file to parse
     * @return the parsed document
     * @throws IOException declared by {@code Jsoup.parse} for file loading problems
     */
    public Document getDoc2(String filePath) throws IOException {
        /********** Begin **********/
        final File source = new File(filePath);
        return Jsoup.parse(source, "UTF-8", "/backups/you.ctrip.com.txt");
        /********** End **********/
    }

    /**
     * Collects every {@code <link>} element that carries an {@code href} attribute.
     *
     * @param doc document to search
     * @return the matching elements
     */
    public Elements getLinks(Document doc) {
        /********** Begin **********/
        Elements linkTags = doc.select("link[href]");
        return linkTags;
        /********** End **********/
    }

    /**
     * Finds the first {@code <div>} whose class is "pop_attention".
     *
     * @param doc document to search
     * @return the first match as returned by {@code Elements.first()}
     */
    public Element getDiv(Document doc) {
        /********** Begin **********/
        Elements candidates = doc.select("div.pop_attention");
        return candidates.first();
        /********** End **********/
    }

    /**
     * Collects every {@code <i>} element that is a direct child of an {@code <li>}.
     *
     * @param doc document to search
     * @return the matching elements
     */
    public Elements getI(Document doc) {
        /********** Begin **********/
        Elements italicChildren = doc.select("li > i");
        return italicChildren;
        /********** End **********/
    }
}
第3关 解析并提取HTML 元素(二)
package step3;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    /**
     * Builds a Document from the file at {@code filePath}, decoded as UTF-8.
     *
     * @param filePath path of the HTML file to parse
     * @return the parsed document
     * @throws IOException declared by {@code Jsoup.parse} for file loading problems
     */
    public Document getDoc(String filePath) throws IOException {
        /********** Begin **********/
        return Jsoup.parse(new File(filePath), "UTF-8");
        /********** End **********/
    }

    /**
     * Describes every {@code <a>} element that has an {@code href} attribute.
     *
     * @param doc document to search
     * @return one entry per anchor, formatted as {@code tagName$url(text)}
     */
    public List<String> getLinks(Document doc) {
        /********** Begin **********/
        List<String> list = new ArrayList<>();
        for (Element anchor : doc.select("a[href]")) {
            String url = withHttpPrefix(anchor.attr("href"));
            list.add(anchor.tagName() + "$" + url + "(" + anchor.text() + ")");
        }
        return list;
        /********** End **********/
    }

    /**
     * Describes every {@code <img>} element by its {@code src} attribute.
     *
     * @param doc document to search
     * @return one entry per image, formatted as {@code tagName$url}
     */
    public List<String> getMedia(Document doc) {
        /********** Begin **********/
        List<String> list = new ArrayList<>();
        for (Element image : doc.select("img")) {
            String url = withHttpPrefix(image.attr("src"));
            list.add(image.tagName() + "$" + url);
        }
        return list;
        /********** End **********/
    }

    /**
     * Describes every {@code <link>} element by its {@code href} and {@code rel} attributes.
     *
     * @param doc document to search
     * @return one entry per link element, formatted as {@code tagName$url(rel)}
     */
    public List<String> getImports(Document doc) {
        /********** Begin **********/
        List<String> list = new ArrayList<>();
        for (Element link : doc.select("link")) {
            String url = withHttpPrefix(link.attr("href"));
            list.add(link.tagName() + "$" + url + "(" + link.attr("rel") + ")");
        }
        return list;
        /********** End **********/
    }

    /**
     * Returns {@code url} unchanged when it already starts with "http"; otherwise
     * prepends "http:" (which turns a protocol-relative "//host/path" into "http://host/path").
     */
    private static String withHttpPrefix(String url) {
        return url.startsWith("http") ? url : "http:" + url;
    }
}
package step4;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    /**
     * Builds a Document from the file at {@code filePath}, decoded as UTF-8.
     *
     * @param filePath path of the HTML file to parse
     * @return the parsed document
     * @throws IOException declared by {@code Jsoup.parse} for file loading problems
     */
    public Document getDoc(String filePath) throws IOException {
        /********** Begin **********/
        File file = new File(filePath);
        return Jsoup.parse(file, "UTF-8", "/backups/hotels.ctrip.com_domestic-city-hotel.txt");
        /********** End **********/
    }

    /**
     * Builds one HotelCity per {@code <a>} element found inside a {@code <dl class="layoutfix">}.
     *
     * <p>For each anchor: the city id is the digits of its {@code href}, the city name is the
     * anchor text, and the pinyin is the letters of the third "/"-separated segment of the
     * {@code href} (empty when the href has fewer than three segments).
     *
     * @param doc document to search
     * @return the collected cities, in document order of the matched anchors
     */
    public List<HotelCity> getAllCitys(Document doc) {
        /********** Begin **********/
        List<HotelCity> list = new ArrayList<>();
        for (Element anchor : doc.select("dl.layoutfix").select("a")) {
            String href = anchor.attr("href");
            // presumably hrefs look like "/hotel/<pinyin><id>" - TODO confirm against the page
            String[] segments = href.split("/");
            // Guard the index so an anchor with a short href does not abort the whole scan.
            String pinyinSegment = segments.length > 2 ? segments[2] : "";

            HotelCity hotelCity = new HotelCity();
            hotelCity.setCityId(href.replaceAll("[^0-9]", ""));
            hotelCity.setCityName(anchor.text());
            hotelCity.setPinyin(pinyinSegment.replaceAll("[^a-zA-Z]", ""));
            // NOTE(review): head pinyin is the constant "A" for every city - confirm this is intended.
            hotelCity.setHeadPinyin("A");
            list.add(hotelCity);
        }
        return list;
        /********** End **********/
    }
}