A:关于代码的注释没有非常详细说明,但代码都是经过实际数据抓取反复使用的,有兴趣和需要可以搭建好环境后尝试
B:针对抓取效率没有进行优化,在时间上有要求的需要自行调整判断和增加逻辑
1、alert弹窗处理
R
#' Dismiss a browser alert if one is currently open.
#'
#' Reads the alert text, prints it, dismisses the alert and then pauses so the
#' page can settle. When reading or dismissing the alert raises an error, the
#' error is swallowed and nothing else happens.
#'
#' @param remdr Browser session object; it must provide `getAlertText()` and
#'   `dismissAlert()`.
#' @param wait_seconds Seconds to pause after dismissing the alert. Defaults
#'   to 1, the value that used to be hard-coded.
#' @return `NULL`, invisibly.
handle_alert_if_exists <- function(remdr, wait_seconds = 1) {
  tryCatch({
    alert_text <- remdr$getAlertText()[[1]]
    cat("⚠️ 捕获弹窗:", alert_text, "\n")
    remdr$dismissAlert()
    # Give the page time to settle once the alert has been closed.
    Sys.sleep(wait_seconds)
  }, error = function(e) {
    # Deliberate best-effort: an error here is treated as "no alert open".
    # NOTE(review): every other error raised by the driver is swallowed too.
    NULL
  })
  invisible(NULL)
}
2、抓取数据的关键词
R
# Keywords to search for; the scraping loop runs once per entry.
search_content <- c(
  "rna",
  "cell",
  "dna",
  "protein"
)
# Page that the browser is navigated to before searching.
spider_url_base <- "https://ngdc.cncb.ac.cn/databasecommons/#stat"
3、隐藏selenium的特征
R
# JavaScript meant to mask Selenium automation markers in the browser:
# it redefines navigator.webdriver to return undefined, assigns a stub
# window.chrome object, and wraps navigator.permissions.query so that the
# 'notifications' query resolves with Notification.permission.
# The pieces are joined with newlines; the empty first and last elements
# keep the leading and trailing newline of the script text.
hide_automation_script <- paste(
  c(
    "",
    "Object.defineProperty(navigator, 'webdriver', {",
    "get: () => undefined",
    "});",
    "window.chrome = {",
    "runtime: {},",
    "// etc.",
    "};",
    "const originalQuery = window.navigator.permissions.query;",
    "window.navigator.permissions.query = (parameters) => (",
    "parameters.name === 'notifications' ?",
    "Promise.resolve({ state: Notification.permission }) :",
    "originalQuery(parameters)",
    ");",
    ""
  ),
  collapse = "\n"
)
4、启动selenium及拉起浏览器
R
# Start the Selenium standalone server in the background (wait = FALSE), so
# the script continues while the server is still starting up.
system("java -jar \"你的文件位置/selenium-server-standalone-3.141.59.jar\" -port 4449",wait = FALSE,invisible = FALSE)
remdr <- remoteDriver(
  browserName = "firefox",
  # remoteServerAddr = "127.0.0.1",
  version = "0.36.0",
  # extraCapabilities = list("moz:firefoxOptions" = list()),
  port = 4449L
)
# To check whether the port is already in use:
#   cmd:        netstat -ano
#   PowerShell: Get-NetTCPConnection | Select-Object LocalAddress, LocalPort, State, OwningProcess | Sort-Object LocalPort

# The server was launched asynchronously, so it may not be listening yet.
# Retry opening the session a bounded number of times instead of failing on
# the first refused connection.
selenium_open_error <- NULL
for (open_attempt in seq_len(10L)) {
  selenium_open_error <- tryCatch(
    {
      remdr$open()
      NULL
    },
    error = function(e) e
  )
  if (is.null(selenium_open_error)) {
    break
  }
  Sys.sleep(2)
}
if (!is.null(selenium_open_error)) {
  stop(
    "Could not open a browser session on the Selenium server (port 4449): ",
    conditionMessage(selenium_open_error),
    call. = FALSE
  )
}
5、数据抓取及保存
A:这部分涉及到2层循环:1、遍历所有关键词的外循环;2、遍历当前关键词所有页数的内循环
B:涉及翻页按钮的操作,以及是否已抓取至最后一页的判断
R
# Empty table that accumulates the scraped rows of every keyword.
bio_database_all <- data.frame()
# Open the target page and show how long the navigation took
# (loading the page can take a noticeable amount of time).
ptm <- proc.time()
remdr$navigate(spider_url_base)
proc.time() - ptm
# Run the automation-hiding JavaScript in the page that was just loaded.
remdr$executeScript(script = hide_automation_script)
# Scrape every result page for each search keyword and append the rows to
# `bio_database_all`.
#
# Uses objects created earlier in the script: `remdr` (an open browser
# session), `search_content`, `bio_database_all` and
# `handle_alert_if_exists()`.
#
# Results are appended page by page and keyword by keyword, so whatever was
# collected so far remains in `bio_database_eachword` / `bio_database_all`
# if the run stops half-way.

# XPath of the results table; all row and cell lookups start from it.
results_table_xpath <- "//table[@class='ui table celled dataTable']"
# XPath of the clickable "next page" link.
nextpage_button_xpath <- "//a[@class='paginate_button item next']"

# Text of the k-th cell of every row in `rows` (NA where the cell is missing).
row_cell_text <- function(rows, k) {
  rows %>%
    html_node(xpath = sprintf("./td[%d]", k)) %>%
    html_text()
}

# href of the k-th link inside the first cell of every row (NA when absent).
row_link_href <- function(rows, k) {
  rows %>%
    html_node(xpath = sprintf("./td[1]/a[%d]", k)) %>%
    html_attr("href")
}

for (i in seq_along(search_content)) {
  Sys.sleep(runif(1, 5, 10))
  # Locate the search box and type the current keyword.
  search_box <- remdr$findElement(using = "xpath", value = "//input[@id='q']")
  Sys.sleep(runif(1, 5, 10))
  search_box$sendKeysToElement(list(search_content[i]))
  # Locate the search button, pause so the page can settle, then click it.
  search_button <- remdr$findElement(using = "xpath", value = "//*[@id='search']")
  Sys.sleep(runif(1, 3, 6))
  search_button$clickElement()
  # Progress message for the current keyword only.
  cat(
    sprintf(
      "Progress: %s",
      str_c(search_content[i], "的数据库信息已开始提取,当前时间:", Sys.time())
    ),
    sep = "\n"
  )
  # Per-keyword storage; reset here so rows do not carry over between keywords.
  bio_database_eachword <- data.frame()
  loop_times <- 0 # number of pages processed for this keyword

  # Choose how many rows are shown per page (the option with data-value 20).
  Sys.sleep(runif(1, 6, 9))
  select_pagenum_per <- remdr$findElement(
    using = "xpath",
    value = "//div[@class='selection ui dropdown']"
  )
  select_pagenum_per$clickElement()
  Sys.sleep(runif(1, 5, 8)) # let the dropdown open before picking a value
  select_pagenum <- remdr$findElement(
    using = "xpath",
    value = "//div[@class='menu transition visible']//div[@data-value='20']"
  )
  select_pagenum$clickElement()
  Sys.sleep(runif(1, 5, 7))

  # Re-read the page source: the table was redrawn after changing page size.
  page_html_source_new <- remdr$getPageSource()[[1]]
  page_html_content_new <- rvest::read_html(page_html_source_new)
  page_num_max <- page_html_content_new %>%
    html_nodes(xpath = "//a[@class='paginate_button item ' and @data-dt-idx='7']") %>%
    html_text()
  # If no pagination element matches, use a placeholder so the progress
  # message below is still printed (a zero-length value would blank it out).
  if (length(page_num_max) == 0) {
    page_num_max <- "?"
  }
  print(page_num_max)

  # Page loop for the current keyword.
  repeat {
    ptm <- proc.time() # start timing this page
    Sys.sleep(runif(1, 3, 6))
    # Fetch and parse the full HTML of the current page.
    page_html_source <- remdr$getPageSource()[[1]]
    page_html_content <- rvest::read_html(page_html_source)

    # Extract row by row so every column has exactly one value per row; a row
    # without a second link yields NA instead of breaking data.frame() with
    # columns of different lengths. Rows lacking the 12 cells read below are
    # skipped (for example a placeholder row, if the site renders one).
    result_rows <- page_html_content %>%
      html_nodes(xpath = paste0(results_table_xpath, "//tbody/tr[td[12]]"))

    bio_database_pageeach <- data.frame(
      database_name = row_cell_text(result_rows, 1L),
      database_link_intro = row_link_href(result_rows, 1L),
      database_link = row_link_href(result_rows, 2L),
      full_name = row_cell_text(result_rows, 2L),
      data_object = row_cell_text(result_rows, 3L),
      data_type = row_cell_text(result_rows, 4L),
      database_category = row_cell_text(result_rows, 5L),
      keyword = row_cell_text(result_rows, 6L),
      location = row_cell_text(result_rows, 7L),
      host_institution = row_cell_text(result_rows, 8L),
      founded_year = row_cell_text(result_rows, 9L),
      citation = row_cell_text(result_rows, 10L),
      z_index = row_cell_text(result_rows, 11L),
      description = row_cell_text(result_rows, 12L),
      # rep() keeps this column valid even when the page has zero rows.
      search_word = rep(search_content[i], length(result_rows))
    )
    bio_database_eachword <- bind_rows(bio_database_eachword, bio_database_pageeach)

    # Progress: current page out of the total read above.
    current_page <- page_html_content %>%
      html_nodes(xpath = "//a[@class='paginate_button item active']") %>%
      html_text()
    if (length(current_page) == 0) {
      current_page <- "?"
    }
    cat(
      sprintf(
        "Progress: %s",
        str_c(
          search_content[i], "的数据库信息已开始提取至第", current_page, "页,共",
          page_num_max, "页数据,当前时间:", Sys.time()
        )
      ),
      sep = "\n"
    )
    # Inside a loop the value is not auto-printed, so print it explicitly.
    print(proc.time() - ptm)
    # current_page_url <- remdr$getCurrentUrl()[[1]] # URL of the current page

    # Class attribute of the "next" button; it may be rendered as <a> or <div>.
    current_page_nextbutton_back1 <- page_html_content %>%
      html_node(xpath = "//a[contains(@class, 'paginate_button') and contains(@class, 'next')]") %>%
      html_attr("class")
    current_page_nextbutton_back2 <- page_html_content %>%
      html_node(xpath = "//div[contains(@class, 'paginate_button') and contains(@class, 'next')]") %>%
      html_attr("class")
    current_page_nextbutton <- if (is.na(current_page_nextbutton_back1)) {
      current_page_nextbutton_back2
    } else {
      current_page_nextbutton_back1
    }

    # Decide whether this was the last page.
    loop_times <- loop_times + 1
    cat("当前页面按钮是否可以翻页", current_page_nextbutton, "第", loop_times, "页", "\n")
    if (is.na(current_page_nextbutton)) {
      # No next button at all: there is nothing to click, so stop paging
      # instead of failing on `if (NA)`.
      cat("未找到下一页按钮,视为已到达最后一页", "\n")
      break
    }
    if (str_detect(current_page_nextbutton, "disabled")) {
      cat("已到达最后一页,下一页按钮不可点击",
          "当前页按钮:", current_page_nextbutton, sep = "\n")
      break
    }

    handle_alert_if_exists(remdr)
    # Go to the next page.
    # NOTE(review): detection above accepts an <a> or a <div> button, but only
    # the <a> form is looked up for clicking.
    nextpage_button <- remdr$findElement(using = "xpath", value = nextpage_button_xpath)
    Sys.sleep(runif(1, 3, 6))
    nextpage_button$clickElement() # clicking too quickly may not take effect
    # Wait for the next page to load.
    Sys.sleep(runif(1, 4, 7))
  }

  bio_database_all <- bind_rows(bio_database_all, bio_database_eachword)
  Sys.sleep(runif(1, 5, 7))
  # Clear the search box before the next keyword. After a search the page
  # shows two inputs; this one is the main search box.
  search_box <- remdr$findElement(using = "xpath", value = "//input[@id='q']")
  # The smaller search box would be located with:
  # search_box <- remdr$findElement(using = "xpath", value = "//div[@class='ui action input']/input[@id='term']")
  Sys.sleep(runif(1, 3, 6))
  search_box$clearElement()
  # remdr$close()
}