R语言基于Rselenium模拟浏览器抓取DatabaseCommons数据-连载NO.04

A：关于代码的注释没有非常详细说明，但代码都是经过实际数据抓取反复使用的，有兴趣和需要可以搭建好环境后尝试

B：针对抓取效率没有进行优化，在时间上有要求的需要自行调整判断和增加逻辑

1、alert弹窗处理

R 复制代码

#' Dismiss a browser alert if one is currently open
#'
#' Asks the driver for the current alert text. If that call (or the
#' subsequent dismissal) raises an error, the error is treated as
#' "no alert to handle" and nothing else happens.
#'
#' @param remdr Driver object exposing `getAlertText()` and `dismissAlert()`.
#' @param wait_secs Seconds to pause after dismissing the alert, so the
#'   dialog has time to close before the caller continues. Defaults to 1.
#' @return Invisibly, `TRUE` when an alert was found and dismissed,
#'   `FALSE` otherwise.
handle_alert_if_exists <- function(remdr, wait_secs = 1) {
  handled <- tryCatch({
    alert_text <- remdr$getAlertText()[[1]]
    cat("⚠️ 捕获弹窗：", alert_text, "\n")
    remdr$dismissAlert()
    Sys.sleep(wait_secs)
    TRUE
  }, error = function(e) {
    # Best effort: an error here is taken to mean there was no alert.
    FALSE
  })
  invisible(handled)
}

2、抓取数据的关键词

R 复制代码

# Keywords to search for; the scraping loop runs one search per entry.
search_content <- c(
  "rna",
  "cell",
  "dna",
  "protein"
)
# Page the browser is pointed at before the searches start.
spider_url_base <- "https://ngdc.cncb.ac.cn/databasecommons/#stat"

3、隐藏selenium的特征

R 复制代码

# JavaScript snippet meant to hide Selenium automation traces from the page.
# It does three things in the document it is executed in:
#   1. redefines `navigator.webdriver` so its getter returns `undefined`;
#   2. assigns a `window.chrome` object containing an empty `runtime`;
#   3. wraps `navigator.permissions.query` so that a query for
#      'notifications' resolves with `Notification.permission`, while every
#      other query is forwarded to the original function.
# The text is passed verbatim to the browser, so it must stay valid JavaScript.
hide_automation_script <- "
Object.defineProperty(navigator, 'webdriver', {
  get: () => undefined
});
window.chrome = {
  runtime: {},
  // etc.
};
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
  parameters.name === 'notifications' ?
    Promise.resolve({ state: Notification.permission }) :
    originalQuery(parameters)
);
"

4、启动selenium及拉起浏览器

R 复制代码

# Start the Selenium standalone server on port 4449. `wait = FALSE` launches
# it in the background so this R session is not blocked by the Java process.
# NOTE(review): `remoteDriver` comes from RSelenium; no library() call is
# visible in this part of the file, so the package must already be attached.

system("java -jar \"你的文件位置/selenium-server-standalone-3.141.59.jar\"  -port 4449",wait = FALSE,invisible = FALSE)

# The server starts asynchronously, so opening a session right away can fail
# with a refused connection. Poll the port (about 30 attempts, one second
# apart) until it accepts a TCP connection; if it never does, fall through
# and let remdr$open() report the failure.
for (attempt in seq_len(30)) {
  server_ready <- tryCatch({
    probe <- suppressWarnings(
      socketConnection(host = "localhost", port = 4449L,
                       open = "r+", blocking = TRUE, timeout = 1)
    )
    close(probe)
    TRUE
  }, error = function(e) FALSE)
  if (server_ready) {
    break
  }
  Sys.sleep(1)
}

remdr <- remoteDriver(browserName ="firefox",
                      # remoteServerAddr = "127.0.0.1",
                      version='0.36.0',
                      # extraCapabilities = list("moz:firefoxOptions" = list()),
                      port=4449L)
# To check which process holds the port, in cmd: netstat -ano
# In PowerShell: Get-NetTCPConnection | Select-Object LocalAddress, LocalPort, State, OwningProcess | Sort-Object LocalPort
remdr$open()

5、数据抓取及保存

A：这部分涉及2层循环：1、遍历所有关键词的外循环；2、遍历当前关键词所有页数的内循环

B：涉及翻页按钮的操作，以及是否已抓取到最后一页的判断

R 复制代码

# Navigate to the target page
ptm <- proc.time()# start time, used to measure how long the navigation takes
remdr$navigate(spider_url_base )# note: the page load itself takes time
proc.time() - ptm
# Run the automation-hiding JavaScript in the page that was just loaded.
# NOTE(review): the script is executed once here, after navigation; the
# original comment said "before every request" — confirm whether it needs to
# be injected again after later page loads.
remdr$executeScript(script = hide_automation_script)

# ---- Scraping loop ---------------------------------------------------------
# Outer loop: one search per keyword in `search_content`.
# Inner loop: walks the result pages of the current keyword until the "next"
# button is disabled or absent.
# NOTE(review): relies on `%>%`, rvest (html_node / html_nodes / html_text /
# html_attr), stringr (str_c / str_detect) and dplyr (bind_rows) being
# attached; no library() calls are visible in this part of the file.

# XPath selecting every body row of the result table.
result_row_xpath <- "//table[@class='ui table celled dataTable']//tbody/tr"

# Text of the k-th cell of each row; NA for rows that have no such cell, so
# every column always has exactly one value per row.
cell_text <- function(rows, k) {
  rows %>%
    html_node(xpath = sprintf("./td[%d]", k)) %>%
    html_text()
}

# href of the k-th link inside the first cell of each row; NA where absent.
first_cell_href <- function(rows, k) {
  rows %>%
    html_node(xpath = sprintf("./td[1]/a[%d]", k)) %>%
    html_attr("href")
}

# Returns `default` when `x` is empty, otherwise `x` unchanged.
or_default <- function(x, default) {
  if (length(x) == 0) default else x
}

# Table that accumulates the rows of every keyword.
bio_database_all <- data.frame()
for (i in seq_along(search_content)) {
  Sys.sleep(runif(1, 5, 10))
  # Locate the search box and type the current keyword.
  search_box <- remdr$findElement(using = "xpath", value = "//input[@id='q']")
  Sys.sleep(runif(1, 5, 10))
  search_box$sendKeysToElement(list(search_content[i]))
  # Locate the search button, pause, then click it.
  search_button <- remdr$findElement(using = "xpath", value = "//*[@id='search']")
  Sys.sleep(runif(1, 3, 6)) # let the page settle, otherwise the page count may be wrong
  search_button$clickElement()

  # Progress message for the keyword being processed (index i only).
  cat(sprintf("Progress: %s", str_c(search_content[i], "的数据库信息已开始提取，当前时间：", Sys.time())), sep = "\n")
  # Per-keyword accumulator; reset here so rows do not leak across keywords.
  bio_database_eachword <- data.frame()
  loop_times <- 0 # number of result pages processed for this keyword
  # Open the rows-per-page dropdown.
  Sys.sleep(runif(1, 6, 9))
  select_pagenum_per <- remdr$findElement(using = "xpath", value = "//div[@class='selection ui dropdown']")
  select_pagenum_per$clickElement()

  # Pick the dropdown entry whose data-value is '20'.
  Sys.sleep(runif(1, 5, 8)) # wait for the menu to become visible
  select_pagenum <- remdr$findElement(using = "xpath", value = "//div[@class='menu transition visible']//div[@data-value='20']")
  select_pagenum$clickElement()

  Sys.sleep(runif(1, 5, 7))

  # Re-read the HTML after changing the page size, so the pagination
  # controls parsed below reflect the new setting.
  page_html_source_new <- remdr$getPageSource()[[1]]
  page_html_content_new <- rvest::read_html(page_html_source_new)

  # Label of the pagination button with data-dt-idx='7', used as the total
  # page count in progress messages. Falls back to a placeholder when that
  # button is not present, so the messages below are still printed.
  page_num_max <- page_html_content_new %>%
    html_nodes(xpath = "//a[@class='paginate_button item ' and @data-dt-idx='7']") %>%
    html_text() %>%
    or_default("未知")
  print(page_num_max)

  # Page loop for the current keyword.
  repeat {
    ptm <- proc.time() # start time of this page
    Sys.sleep(runif(1, 3, 6))
    # Fetch and parse the full HTML of the current page.
    page_html_source <- remdr$getPageSource()[[1]]
    page_html_content <- rvest::read_html(page_html_source)

    # Extract row by row so all columns stay aligned even when a row lacks a
    # cell or a link. Rows without a second cell are dropped; such rows
    # cannot be data rows, which carry twelve cells.
    result_rows <- page_html_content %>%
      html_nodes(xpath = result_row_xpath)
    result_rows <- result_rows[!is.na(cell_text(result_rows, 2L))]

    bio_database_pageeach <- data.frame(
      database_name = cell_text(result_rows, 1L),
      database_link_intro = first_cell_href(result_rows, 1L),
      database_link = first_cell_href(result_rows, 2L),
      full_name = cell_text(result_rows, 2L),
      data_object = cell_text(result_rows, 3L),
      data_type = cell_text(result_rows, 4L),
      database_category = cell_text(result_rows, 5L),
      keyword = cell_text(result_rows, 6L),
      location = cell_text(result_rows, 7L),
      host_institution = cell_text(result_rows, 8L),
      founded_year = cell_text(result_rows, 9L),
      citation = cell_text(result_rows, 10L),
      z_index = cell_text(result_rows, 11L),
      description = cell_text(result_rows, 12L),
      # rep() keeps this column zero-length when the page has no rows.
      search_word = rep(search_content[i], length(result_rows))
    )
    bio_database_eachword <- bind_rows(bio_database_eachword, bio_database_pageeach)

    # Progress: label of the active pagination button.
    current_page <- page_html_content %>%
      html_nodes(xpath = "//a[@class='paginate_button item active']") %>%
      html_text() %>%
      or_default("未知")
    cat(sprintf("Progress: %s", str_c(search_content[i], "的数据库信息已开始提取至第", current_page, "页，共",
                                      page_num_max, "页数据，当前时间：", Sys.time())), sep = "\n")
    # Explicit print: a bare expression is not auto-printed inside a loop.
    print(proc.time() - ptm)

    # Class attribute of the "next" button: look for an <a> first, then fall
    # back to a <div>. NA when neither exists.
    current_page_nextbutton_back1 <- page_html_content %>%
      html_node(xpath = "//a[contains(@class, 'paginate_button') and contains(@class, 'next')]") %>%
      html_attr("class")
    current_page_nextbutton_back2 <- page_html_content %>%
      html_node(xpath = "//div[contains(@class, 'paginate_button') and contains(@class, 'next')]") %>%
      html_attr("class")
    current_page_nextbutton <- if (is.na(current_page_nextbutton_back1)) {
      current_page_nextbutton_back2
    } else {
      current_page_nextbutton_back1
    }

    loop_times <- loop_times + 1
    cat("当前页面按钮是否可以翻页", current_page_nextbutton, "第", loop_times, "页", "\n")
    # Stop when the next button is disabled, or when there is no next button
    # at all (NA), which would otherwise make `if` fail on a missing value.
    if (is.na(current_page_nextbutton) || str_detect(current_page_nextbutton, "disabled")) {
      cat("已到达最后一页，下一页按钮不可点击",
          "当前页按钮：", current_page_nextbutton, sep = "\n")
      break
    }
    handle_alert_if_exists(remdr)
    # Click through to the next page.
    nextpage_button_xpath <- "//a[@class='paginate_button item next']"
    nextpage_button <- remdr$findElement(using = "xpath", value = nextpage_button_xpath)
    Sys.sleep(runif(1, 3, 6))
    nextpage_button$clickElement() # clicking too quickly may not take effect

    # Wait for the next page to load.
    Sys.sleep(runif(1, 4, 7))
  }
  bio_database_all <- bind_rows(bio_database_all, bio_database_eachword)
  Sys.sleep(runif(1, 5, 7))
  # Clear the search box before the next keyword. After a search the page
  # shows two input fields; this targets the one with id 'q'.
  search_box <- remdr$findElement(using = "xpath", value = "//input[@id='q']")
  # The smaller search box would be located with:
  # search_box <- remdr$findElement(using = "xpath", value = "//div[@class='ui action input']/input[@id='term']")

  Sys.sleep(runif(1, 3, 6))
  search_box$clearElement()
  # remdr$close()
}