背景:项目需要从说明书的 HTML 文件中提取目录信息。
实现:由于原始 HTML 包含说明书的全部内容,所以先手动把其中的目录部分单独保存为一个新的 HTML 文件 dir26.html,再用下面的脚本解析该文件。
Python 脚本(解析 dir26.html 并生成 manualdir.json):
import requests
from bs4 import BeautifulSoup
import json
# Path of the hand-made HTML file that contains only the table of contents.
INPUT_HTML = "dir26.html"  # replace with the actual file path
# JSON file consumed by the Android side.
OUTPUT_JSON = "manualdir.json"


def parse_entry(text):
    """Split the text of one table-of-contents element into title and page.

    The text is split on single spaces (``str.split(" ")``): field 0 is taken
    as the directory title and field 2 as the page number. The page is kept
    as a string, exactly as it appears in the text.

    Returns:
        A ``{"directory": title, "page": page}`` dict, or ``None`` when the
        text has fewer than three space-separated fields and therefore
        cannot be a complete entry.
    """
    fields = text.split(" ")
    if len(fields) < 3:
        # Not a "title <something> page" line; indexing [2] would raise.
        return None
    return {"directory": fields[0], "page": fields[2]}


def main():
    """Read the TOC HTML, extract every entry and write them out as JSON."""
    # NOTE(review): assumes the HTML file is UTF-8 encoded - confirm.
    with open(INPUT_HTML, "r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file.read(), "html.parser")

    data = []
    for div in soup.find_all("div"):  # adjust the selector as needed
        entry = parse_entry(div.text)
        if entry is None:
            continue
        print(entry["directory"])
        print(entry["page"])
        data.append(entry)

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 instead of depending on the platform locale.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False)


if __name__ == "__main__":
    main()
Android 端读取该 JSON 文件内容的代码:
java
// Holds the entries read from the JSON file: directory title -> "#pf" + page number in hex.
// A LinkedHashMap keeps the entries in the order they were inserted.
private LinkedHashMap<String, String> mDirMap = new LinkedHashMap<String, String>();
/**
 * Loads the table of contents from the "manualdir.json" asset into mDirMap.
 *
 * Each element of the JSON array supplies a "directory" title and a "page"
 * number; the title is mapped to the string "#pf" followed by the page number
 * in hexadecimal. On any read, decoding or parse failure a warning is logged
 * and the method returns; entries added before a parse failure are kept.
 */
private void initDirectory() {
    String json;
    // try-with-resources closes the asset stream on every exit path.
    try (InputStream is = getContext().getResources().getAssets().open("manualdir.json")) {
        // Read until end of stream: available() is only an estimate and a
        // single read() call is not guaranteed to fill the whole buffer.
        java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int count;
        while ((count = is.read(chunk)) != -1) {
            out.write(chunk, 0, count);
        }
        // Decode the collected bytes as a UTF-8 string.
        json = out.toString("UTF-8");
    } catch (UnsupportedEncodingException e) {
        // Must precede IOException, of which it is a subclass.
        Log.w(TAG, "manual dir json UnsupportedEncodingException e" + e);
        return;
    } catch (IOException e) {
        Log.w(TAG, "manual dir json IOException e:" + e);
        return;
    }

    // Parse the JSON text and copy every entry into the map.
    try {
        JSONArray jsa = new JSONArray(json);
        // Logged once; the length does not change while iterating.
        Log.w(TAG, " jsa.length():" + jsa.length());
        for (int i = 0; i < jsa.length(); i++) {
            JSONObject jso = jsa.getJSONObject(i);
            String title = jso.optString("directory"); // entry title
            Log.w(TAG, "title:" + title);
            int page = jso.optInt("page"); // page number of the entry
            mDirMap.put(title, "#pf" + Integer.toHexString(page));
        }
    } catch (JSONException e) {
        Log.w(TAG, "manual dir json exception e:" + e);
    }
}