Preparation:
Fetch all the tags and their links, and store them in a JSON file.
python
from bs4 import BeautifulSoup
import requests
import extract
import Into_Tag
import read
import json

def get_info(filepath):
    """Read the locally cached HTML file; return None if it cannot be read."""
    content = None
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"{e}")
    return content

# content = read.get_HTML()

def main(url):
    try:
        response = requests.get(url)
        content = response.text
        read.trace_web(content)
    except Exception as e:
        print('WLAN Error:', e)
    return

def clear_f(filename):
    """Truncate the file so the next run starts from an empty file."""
    with open(filename, 'w', encoding='utf-8') as file:
        pass

def store_tag(content):
    """Parse the tag list from the HTML and write one JSON object per line."""
    all_tag = read.trace_web(content)
    with open(Tag_file, 'w', encoding='utf-8') as file:
        for data in all_tag:
            json.dump(data, file)
            file.write('\n')
    return

fp = r"E:\Procedure\Python\Experiment\webpage.txt"
url = 'https://huggingface.co/datasets'
Tag_file = r'E:\Procedure\Python\Experiment\url_queue.json'
f = r"E:\Procedure\Python\Experiment\memo.json"
print('Succeed')
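As written, the module-level code only defines the paths and prints 'Succeed'; the functions above are presumably invoked by hand. A minimal driver sketch, assuming webpage.txt has already been cached by read.get_HTML():
python
# Sketch of the preparation step (assumption: the cached page already exists at fp).
content = get_info(fp)      # read the cached HTML
if content:
    clear_f(Tag_file)       # start from an empty queue file
    store_tag(content)      # writes one JSON object per line: {'link': ..., 'mainclass': ..., 'subclass': ...}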
Utility function definitions:
python
import requests
from bs4 import BeautifulSoup
import Into_Tag

def get_HTML():
    """Download the dataset index page and cache it locally."""
    web = 'https://huggingface.co/datasets'
    try:
        response = requests.get(web, timeout=30)
        source_path = r"E:\Procedure\Python\Experiment\webpage.txt"
        with open(source_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        return response.text
    except Exception:
        return None

def trace_web(content):
    """Extract every (main category, sub-tag, link) triple from the tag sidebar."""
    soup = BeautifulSoup(content, 'html.parser')
    tag_divs = soup.find_all('div', class_='mb-3')
    all_tag = []
    for tag_div in tag_divs:
        try:
            # Extract the main category text
            tag_text = tag_div.find('div', class_='mb-3 text-sm font-medium text-gray-500').get_text(strip=True)
            # Extract each sub-tag and its link
            small_tags = tag_div.find_all('a', class_='tag tag-white')
            for small_tag_element in small_tags:
                try:
                    small_tag_text = small_tag_element.span.get_text(strip=True)
                    small_tag_link = small_tag_element['href']
                    print("sub-tag:", small_tag_text)
                    print("sub-tag link:", small_tag_link)
                    all_tag.append({'link': small_tag_link, 'mainclass': tag_text, 'subclass': small_tag_text})
                except AttributeError:
                    continue
            print("-" * 50)
        except AttributeError:
            continue
    return all_tag
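Each element that trace_web appends to all_tag has the same three keys that the crawler later reads back with obtain_url. An illustrative entry (the concrete values below are the ones reused in Into_Tag.trytorep; real values depend on the live page):
python
# Illustrative shape of one trace_web() result entry.
example = {
    'link': '/datasets?task_categories=task_categories%3Aimage-classification',
    'mainclass': 'Computer Vision',
    'subclass': 'Image Classification',
}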
Start crawling:
Into_Tag.py
python
import requests
from bs4 import BeautifulSoup
import json
import extract

base = 'https://huggingface.co'
tbase = 'https://huggingface.co/datasets'

def obtain_url():
    """Load the tag/link queue written during the preparation step."""
    filename = r"E:\Procedure\Python\Experiment\url_queue.json"
    url_list = []
    try:
        with open(filename, 'r') as file:
            for line in file:
                data = json.loads(line)
                url_list.append(data)
    except Exception as e:
        print('Did not read:', e)
    return url_list

def store_Web(result):
    """Append the extracted dataset records, one JSON object per line."""
    filename = r"E:\Procedure\Python\Experiment\res1.json"
    if result is None:
        return
    print(filename)
    with open(filename, 'a') as file:
        for data in result:
            json.dump(data, file)
            file.write('\n')
    return

def judge(links, tag, s_tag):
    """Visit every link not yet in the memo file, extract its info, and update the memo."""
    filename = r"E:\Procedure\Python\Experiment\memo1.json"
    print(filename)
    dt = {}
    result = []
    try:
        with open(filename, 'r') as file:
            dt = json.load(file)
    except (FileNotFoundError, json.decoder.JSONDecodeError):
        dt = {}
    if links is None or len(links) == 0:
        print("links is empty")
        return
    for lk in links:
        if lk is None:
            continue
        link = lk.get('href')
        if link is None:
            continue
        if link in dt:
            continue
        try:
            response = requests.get(base + link, timeout=20)
            content = response.text
            res = extract.extract_info(content, tag, s_tag)
            dt.update({link: 0})
            result.append(res)
        except requests.exceptions.Timeout:
            print('Timeout while accessing:', base + link)
            continue
        except Exception as e:
            print('Error while accessing:', base + link, e)
            continue
    try:
        with open(filename, 'w') as file:
            json.dump(dt, file)
        print('memo')
    except Exception:
        print('false')
    return result

def get_page_num(soup):
    """Return the number of result pages, or -1 if no pagination is found."""
    li = soup.find_all('li', class_='hidden sm:block')
    if li is None or len(li) == 0:
        return -1
    num = int(li[-1].text)
    return num

def one_by_one(url, mclass='', sclass=''):
    """Collect the dataset links from every result page under one tag."""
    try:
        response = requests.get(base + url, timeout=10)
    except Exception as e:
        print(e)
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    num = get_page_num(soup)
    if num == -1:
        return links
    # Page through the results via the 'p' query parameter.
    # Note: the loop starts at page 5 here; adjust the start index if pages 0-4 are also needed.
    for i in range(5, num):
        params = {'p': i, 'sort': 'trending'}
        page_url = base + url
        try:
            content = requests.get(page_url, params=params, timeout=10).text
            tsoup = BeautifulSoup(content, 'html.parser')
            tlinks = tsoup.find_all('a', class_='block p-2')
            links += tlinks
        except Exception:
            continue
    return links

def trytorep():
    url = '/datasets?task_categories=task_categories%3Aimage-classification'
    links = one_by_one(url)
    res = judge(links, "Computer Vision", 'Image Classification')
    store_Web(res)

def back():
    """Resume crawling from tag index 45 in the queue."""
    url_list = obtain_url()
    count = 45
    for d in url_list[45:46]:
        link = d.get('link')
        mclass = d.get('mainclass')
        sclass = d.get('subclass')
        links = one_by_one(link)
        res = judge(links, mclass, sclass)
        store_Web(res)
        print(count)
        count += 1

def main():
    url_list = obtain_url()
    count = 5
    for d in url_list[5:6]:
        link = d.get('link')
        mclass = d.get('mainclass')
        sclass = d.get('subclass')
        links = one_by_one(link)
        res = judge(links, mclass, sclass)
        store_Web(res)
        print(count)
        count += 1

def get_tag_size(url_list):
    print("tag_size:", len(url_list))
    d = url_list[5]
    mclass = d.get('mainclass')
    sclass = d.get('subclass')
    links = [{'href': '/datasets/CyberHarem/surtr_arknights?not-for-all-audiences=true'}]
    res = judge(links, mclass, sclass)
    # store_Web(res)
    return

url_list = obtain_url()
get_tag_size(url_list)
Notes:
Pay attention to how pagination is handled correctly (the result pages are requested one by one through the 'p' query parameter, as in one_by_one).
Some special pages can only be accessed after appending "?not-for-all-audiences=true" to the link.
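A minimal sketch of both points, using only the requests calls already shown above (the tag link and dataset path are the ones that appear in Into_Tag; treat them as illustrative):
python
import requests

base = 'https://huggingface.co'

# Pagination: request each result page explicitly via the 'p' parameter,
# the same way one_by_one() does.
page = requests.get(base + '/datasets?task_categories=task_categories%3Aimage-classification',
                    params={'p': 1, 'sort': 'trending'}, timeout=10)

# Special pages: append the query string before requesting,
# otherwise the page cannot be fetched directly.
link = '/datasets/CyberHarem/surtr_arknights'
page = requests.get(base + link + '?not-for-all-audiences=true', timeout=10)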