1. Approach:
- Analyze the URL: the image URLs are embedded in the response returned from base_url (see the extraction sketch below)
- Download the images
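
The extraction step is a two-level XPath query: select each `<li>` under `div.slist`, then read its `a/img/@src` attribute. Here is a minimal sketch against stand-in HTML (hypothetical markup, assumed to match the structure the scraper targets); note that the `src` values are relative paths that still need the `base_url` prefix.

```python
from lxml import etree

# Stand-in for the listing page returned by base_url (hypothetical markup,
# matching the div.slist/ul/li structure the scraper expects).
sample_html = '''
<div class="slist">
  <ul>
    <li><a href="/tupian/1.html"><img src="/uploads/a.jpg"/></a></li>
    <li><a href="/tupian/2.html"><img src="/uploads/b.jpg"/></a></li>
  </ul>
</div>
'''

tree = etree.HTML(sample_html)
for li in tree.xpath('//div[@class="slist"]/ul/li'):
    # Each thumbnail's src is a relative path; prefixing it with base_url
    # yields a downloadable image URL.
    print(li.xpath('./a/img/@src')[0])  # /uploads/a.jpg, then /uploads/b.jpg
```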
2. Code
```python
import time
import requests
import os
from lxml import etree
class DownloadImg:
    '''
    Scraper that downloads gallery images.
    '''
    def __init__(self):
        self.url = 'http://xxxxxx/4kmeinv/'
        self.base_url = 'xxxxxxxxxx'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }
        self.page = 1

    # Create the local directory that images are saved into
    def create_img_dir(self):
        current_dir = os.path.dirname(__file__)
        img_dir = os.path.join(current_dir, 'img')
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)
        return img_dir
    # Download every image found on each listing page
    def download_img(self, url_list):
        img_dir = self.create_img_dir()
        for url in url_list:
            res = requests.get(url=url, headers=self.headers).text
            tree = etree.HTML(res)
            # Parse out each thumbnail's relative src and build the full URL
            for li in tree.xpath('//div[@class="slist"]/ul/li'):
                img_url = li.xpath('./a/img/@src')[0]
                full_img_url = self.base_url + img_url
                print(full_img_url)
                img_name = full_img_url.split('/')[-1]
                full_img_name = os.path.join(img_dir, img_name)
                # Fetch the image bytes and write them to disk
                with open(full_img_name, 'wb') as fs:
                    content = requests.get(url=full_img_url, headers=self.headers).content
                    fs.write(content)
                print("Image {} downloaded".format(img_name))
                time.sleep(1)
    # Build the listing-page URLs, one per page, and return them as a list
    def get_img_url(self, page):
        url_list = [self.url]
        if page == 1:
            return url_list
        # Pages after the first follow the pattern index_<n>.html,
        # e.g. https://xxxxxxx/index_3.html
        # Note: format the loop variable i, not page, or every extra page
        # would point at the same URL.
        for i in range(2, page + 1):
            multi_url = self.url + "index_{}.html".format(i)
            url_list.append(multi_url)
        return url_list
if __name__ == '__main__':
    # Number of pages to download: 2
    page = 2
    # Instantiate the downloader
    down_img = DownloadImg()
    url_list = down_img.get_img_url(page)
    print(url_list)
    down_img.download_img(url_list)
    print("All images downloaded, exiting")
```