from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
import requests
from urllib.parse import urlparse
import os
from lxml import etree
from urllib.parse import urljoin
def get_pdf(cur_url):
    """Submit the gated registration form on a white-paper page and save the PDF.

    Opens ``cur_url`` in a fresh Chrome session, switches into the embedded
    form iframe, fills in the registration fields, submits the form, reads the
    download link that appears, and saves the linked file into the current
    working directory under the basename of the link's path.

    Args:
        cur_url: URL of the white-paper page that embeds the form.

    Raises:
        selenium.common.exceptions.WebDriverException: a required page
            element (iframe, mandatory field, submit button, download link)
            could not be located or used.
        ValueError: the download link has no usable URL or file name.
        requests.RequestException: the download failed, timed out, or the
            server answered with an HTTP error status.
    """
    # Local import so this function stays self-contained; selenium is already
    # a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    proxies = {'http': '192.168.1.122:1080', 'https': '192.168.1.122:1080'}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
        'referer': 'https://endeavor.dragonforms.com/',
    }
    scroll_to_form = "window.scrollTo(0, document.body.scrollHeight/2+200);"

    # (input id, value to type, required?) in the order the fields are filled.
    # Optional fields are not present on every form variant, so a failure on
    # them is tolerated.
    # NOTE(review): the first value looks like an e-mail redaction placeholder
    # rather than a real address -- confirm the intended value.
    text_fields = (
        ("id13", '[email protected]', True),
        ("id1", 'chen', True),
        ("id2", 'chen', True),
        ("id10", 'beijing', True),
        ("id4", 'chen', False),
        ("id3", 'beijing', True),
        ("id6", 'beijing', False),
        ("id9", '101300', False),
        ("id11", '18518076020', False),
    )
    # Drop-down (<select>) fields: (element id, visible option text, required?).
    # (Original note, translated: "multi-select boxes: city and ...".)
    select_fields = (
        ("id7", "CHINA", True),
        ("id8", "FOREIGN", True),
        ("id5082617", "No", False),
        ("id5082616", "No", False),
    )

    d = webdriver.Chrome()
    try:
        d.get(cur_url)
        d.execute_script(scroll_to_form)
        # Fixed pause to give the embedded form time to load.
        time.sleep(10)

        # Use the located iframe element directly; going through its id
        # attribute would fail for an iframe without an id.
        iframe = d.find_element(
            By.XPATH,
            '//div[@class="embed center fullWidth" or @class="embed fullWidth"]//iframe',
        )
        d.switch_to.frame(iframe)

        for field_id, value, required in text_fields:
            try:
                d.find_element(By.XPATH, '//input[@id="%s"]' % field_id).send_keys(value)
            except WebDriverException:
                if required:
                    raise

        for field_id, option_text, required in select_fields:
            try:
                Select(d.find_element(By.ID, field_id)).select_by_visible_text(option_text)
            except WebDriverException:
                if required:
                    raise

        time.sleep(5)

        # Scrolling is done on the top-level document, so leave the iframe,
        # scroll, and re-enter it before touching form elements again.
        d.switch_to.default_content()
        d.execute_script(scroll_to_form)
        d.switch_to.frame(iframe)
        d.find_element(By.ID, "custombtn").click()

        d.switch_to.default_content()
        d.execute_script("window.scrollTo(0, document.body.scrollHeight/2-600);")
        d.switch_to.frame(iframe)
        url = d.find_element(
            By.XPATH, '//div[@class="downloadReport-btn"]/a'
        ).get_attribute('href')
    finally:
        # Always release the browser, even when a step above fails; otherwise
        # every call (and every retry) leaks a Chrome process.
        d.quit()

    if not url:
        raise ValueError("download link has no href on page: %s" % cur_url)
    pdf_name = os.path.basename(urlparse(url).path)
    if not pdf_name:
        raise ValueError("cannot derive a file name from download URL: %s" % url)

    # Fetch before opening the output file so that a failed request does not
    # leave an empty or bogus file behind.
    response = requests.get(url, proxies=proxies, headers=headers, timeout=60)
    response.raise_for_status()
    with open(pdf_name, 'wb') as f:
        f.write(response.content)
def main():
    """Collect white-paper links from the listing page and download each PDF.

    Loads the white-papers listing in Chrome, scrolls to the bottom
    repeatedly so that more entries get loaded, gathers the href of every
    title link, and then calls ``get_pdf`` for each link with up to three
    attempts per link.
    """
    listing_url = "https://www.militaryaerospace.com/white-papers"
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    max_attempts = 3

    d = webdriver.Chrome()
    try:
        d.get(listing_url)
        # Eight scroll-and-wait rounds followed by one final scroll.
        for _ in range(8):
            d.execute_script(scroll_to_bottom)
            time.sleep(2)
        d.execute_script(scroll_to_bottom)

        eles = d.find_elements(
            By.XPATH, '//div[@class="items-wrapper"]//a[@class="title-wrapper"]'
        )
        # Read the hrefs while the page is still open so the downloads below
        # do not depend on live browser elements.
        urls = [ele.get_attribute('href') for ele in eles]
    finally:
        # The listing browser is no longer needed once the links are known.
        d.quit()

    for cur_url in urls:
        if not cur_url:
            continue
        for attempt in range(1, max_attempts + 1):
            try:
                get_pdf(cur_url)
                break
            except Exception as e:
                # Best-effort crawl: report the failure and retry instead of
                # aborting the whole run or hiding the error.
                print("attempt %d/%d failed for %s: %r" % (attempt, max_attempts, cur_url, e))


if __name__ == "__main__":
    main()