Check whether two images are identical

```python
from PIL import Image
import numpy as np


def check_img_repeat():
    """
    Perceptual hashing generates a "fingerprint" of an image: as long as the
    content stays the same, the fingerprint stays similar even after quality
    changes, rescaling, or other processing, which makes it useful for image
    similarity comparison. This implementation uses the difference-hash
    (dHash) variant.
    """
    def dhash(img, hash_size=8):
        # Convert to grayscale and shrink to (hash_size + 1) x hash_size
        img = img.convert('L').resize(
            (hash_size + 1, hash_size),
            Image.LANCZOS,
        )
        # Convert the image to an array
        pixels = np.array(img)
        # Compare adjacent pixels
        diff = pixels[:, 1:] > pixels[:, :-1]
        # Build the hash from the comparison results
        return sum(2 ** i for (i, v) in enumerate(diff.flatten()) if v)

    # Hamming distance between two hashes, i.e. the number of differing bits
    def hamming_distance(hash1, hash2):
        return bin(hash1 ^ hash2).count('1')

    img_1 = Image.open("1.png")
    img_2 = Image.open("2.png")
    # Compute the dHash of each image
    hash1 = dhash(img_1)
    hash2 = dhash(img_2)
    print(hash1, hash2)
    distance = hamming_distance(hash1, hash2)
    # Judge similarity from the Hamming distance
    print(f"Hamming distance between the images: {distance}")
    if distance <= 5:
        print("Images are similar.")
    else:
        print("Images are not similar.")


check_img_repeat()
```
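Note that dHash measures perceptual similarity rather than strict identity. If "identical" really means byte-for-byte equality, a cryptographic digest of the file contents is simpler and exact; a minimal sketch, reusing the `1.png`/`2.png` file names from above:

```python
import hashlib


def file_md5(path):
    # Hash the raw file bytes; identical files produce identical digests
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


print("identical" if file_md5("1.png") == file_md5("2.png") else "different")
```

Keep in mind this compares encodings, not pixels: the same picture saved twice with different compression settings hashes differently, which is exactly the case dHash handles.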
Check whether two images are identical, for remote images stored in a database

```python
from PIL import Image
import numpy as np
import requests
from io import BytesIO
import mysql.connector


def select_user():
    # Connect to the MySQL database
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="123456",
        database="test"
    )
    # Create a cursor object
    mycursor = mydb.cursor()
    # Build the query
    sql = "SELECT uid, head FROM user"
    # Execute the query
    mycursor.execute(sql)
    # Fetch the results
    result = mycursor.fetchall()
    return result


def check_img_repeat():
    def dhash(img, hash_size=8):
        # Convert to grayscale and shrink to (hash_size + 1) x hash_size
        img = img.convert('L').resize(
            (hash_size + 1, hash_size),
            Image.LANCZOS,
        )
        # Convert the image to an array
        pixels = np.array(img)
        # Compare adjacent pixels
        diff = pixels[:, 1:] > pixels[:, :-1]
        # Build the hash from the comparison results
        return sum(2 ** i for (i, v) in enumerate(diff.flatten()) if v)

    # Hamming distance between two hashes, i.e. the number of differing bits
    def hamming_distance(hash1, hash2):
        return bin(hash1 ^ hash2).count('1')

    result = select_user()
    for row in result:
        # Download the image with requests
        response = requests.get(row[1])
        # Check whether the download succeeded
        if response.status_code == 200:
            # Wrap the image bytes in a file-like object
            image_file = BytesIO(response.content)
            # Open the downloaded image with Image.open
            img_1 = Image.open(image_file)
            img_2 = Image.open("2.png")
            # Compute the dHash of each image
            hash1 = dhash(img_1)
            hash2 = dhash(img_2)
            distance = hamming_distance(hash1, hash2)
            # Judge similarity from the Hamming distance
            if distance <= 5:
                print(row[0])
        else:
            print(row[0], "failed to download image")


check_img_repeat()
```
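To find duplicates among all the stored avatars, rather than comparing each against one local file, it can be cheaper to hash every image once and then compare the hashes pairwise. A rough sketch, assuming `dhash` and `hamming_distance` from the block above are lifted to module level and that rows are `(uid, url)` pairs as returned by `select_user()`:

```python
from io import BytesIO
from itertools import combinations

import requests
from PIL import Image


def find_duplicate_avatars(rows, threshold=5):
    # rows: iterable of (uid, image_url) pairs, e.g. the result of select_user()
    hashes = {}
    for uid, url in rows:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            hashes[uid] = dhash(Image.open(BytesIO(resp.content)))
    # Pairwise comparison is O(n^2), which is fine for modest tables
    for (uid_a, h_a), (uid_b, h_b) in combinations(hashes.items(), 2):
        if hamming_distance(h_a, h_b) <= threshold:
            print(uid_a, uid_b, "look like duplicates")
```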
Scrape Douban movie data

```python
# coding= utf-8
import re
from time import sleep
import requests
from lxml import etree
import random
import csv


def get_info(url, name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        with open('info_html.txt', 'a', encoding='utf-8') as finfo:
            finfo.write(resp.text)
    else:
        print("Failed to retrieve the info webpage")
        return
    html = resp.text
    tree = etree.HTML(html)
    # Director
    director = tree.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
    # Genres
    type_ = re.findall(r'property="v:genre">(.*?)</span>', html)
    type_ = '/'.join(type_)
    # Country/region (the regex matches the Chinese label in Douban's HTML)
    country = re.findall(r'地区:</span> (.*?)<br', html)[0]
    # Release year
    time = tree.xpath('//*[@id="content"]/h1/span[2]/text()')[0]
    time = time[1:5]
    # Rating
    rate = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
    # Number of ratings
    people = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
    print(name, director, type_, country, time, rate, people)  # print the result
    csvwriter.writerow((name, director, type_, country, time, rate, people))  # save to the CSV


def main(page, f):
    url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        with open('db_html.txt', 'a', encoding='utf-8') as fh:
            fh.write(resp.text)
    else:
        print("Failed to retrieve the webpage")
        return
    tree = etree.HTML(resp.text)
    # List of detail-page links
    href_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/@href')
    # List of movie titles
    name_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    for url, name in zip(href_list, name_list):
        f.flush()  # flush the CSV file
        try:
            get_info(url, name)  # scrape the detail page
        except Exception:
            pass
        sleep(1 + random.random())  # pause between requests
    print(f'Page {page + 1} done')


if __name__ == '__main__':
    # Open the file that will hold the data
    with open('movie-xpath.csv', 'a', encoding='utf-8', newline='') as f:
        csvwriter = csv.writer(f)
        # Write the header row
        csvwriter.writerow(('Title', 'Director', 'Genre', 'Country', 'Year', 'Rating', 'Votes'))
        for i in range(10):  # scrape 10 pages
            main(i, f)  # call the main function
            sleep(3 + random.random())
```
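Douban rate-limits aggressive clients, and the bare `except` around `get_info` silently drops any page that fails. One hedge is a small retry wrapper around `requests.get`; the `fetch` name and its parameters below are ours, not part of the original script:

```python
import requests
from time import sleep


def fetch(url, headers=None, retries=3, backoff=2.0):
    # Retry transient failures with a growing delay instead of dropping the page
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        sleep(backoff * (attempt + 1))
    return None
```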
Download Bilibili videos

```python
# encoding:utf-8
import requests
import re
import json


def getVideo(url=''):
    if url == '':
        print('Please provide a url')
        return
    headers = {
        # Referer: anti-hotlink header telling the server where the request came from
        "Referer": url,
        # User-Agent: basic browser/device identity
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        # "Cookie": "xxxx",  # fill in your own cookie if higher qualities require login
    }
    # Send the request
    response = requests.get(url=url, headers=headers)
    # Dump the HTML for debugging
    if response.status_code == 200:
        with open('videoHtml.txt', 'w', encoding='utf-8') as finfo:
            finfo.write(response.text)
    else:
        print("Failed to retrieve the info webpage")
        return
    html = response.text
    # Parse the page: extract the video title
    title = re.findall('title="(.*?)"', html)[0]
    # Strip characters that are illegal in file names
    for t in title:
        if t in '\\/:*?\"<>|':
            title = title.replace(t, '')
    # Extract the playback info
    info = re.findall('window.__playinfo__=(.*?)</script>', html)[0]
    # info is a JSON string; parse it into a dict
    json_data = json.loads(info)
    # Video stream URL
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    # Audio stream URL
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    # Download both streams
    video_content = requests.get(url=video_url, headers=headers).content
    audio_content = requests.get(url=audio_url, headers=headers).content
    # Save the data (the DASH audio is AAC even though it is saved as .mp3 here)
    with open(f'{title}.mp4', mode='wb') as v:
        v.write(video_content)
    with open(f'{title}.mp3', mode='wb') as a:
        a.write(audio_content)


url = "https://www.bilibili.com/video/BV16e4y1k7Rs/?spm_id_from=333.999.0.0&vd_source=d8494042caaa3e41d56c7e04cc1669dd"
getVideo(url)
```
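Bilibili serves video and audio as separate DASH streams, so the two files written above still need to be muxed together. A sketch that shells out to ffmpeg, assuming it is installed and on PATH (the `merge_av` helper is ours; file names follow the block above):

```python
import subprocess


def merge_av(title):
    # Copy both streams into one container without re-encoding
    subprocess.run([
        'ffmpeg', '-y',
        '-i', f'{title}.mp4',   # video-only stream
        '-i', f'{title}.mp3',   # audio stream (AAC despite the extension)
        '-c', 'copy',
        f'{title}_merged.mp4',
    ], check=True)
```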
Extract video subtitles with OCR

```python
# encoding:utf-8
import cv2
import pytesseract

# Point pytesseract at the Tesseract binary (adjust to your install)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Open the video
video_path = 'video.mp4'
video = cv2.VideoCapture(video_path)
# Path of the subtitle file to write
output_file = 'subtitles.srt'
subtitles = []
# Process the video frame by frame
while True:
    ret, frame = video.read()
    if not ret:
        break
    # Convert the frame to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Extract text with Tesseract
    text = pytesseract.image_to_string(gray)
    # Timestamp of the current frame in milliseconds
    timestamp = video.get(cv2.CAP_PROP_POS_MSEC)
    # Append the text and timestamp to the subtitle list
    subtitles.append((timestamp, text))
# Release the video stream
video.release()


def ms_to_srt(ms):
    # Format milliseconds as the HH:MM:SS,mmm timestamp SRT expects
    ms = int(ms)
    h, rem = divmod(ms, 3600000)
    m, rem = divmod(rem, 60000)
    s, ms = divmod(rem, 1000)
    return f'{h:02}:{m:02}:{s:02},{ms:03}'


# Save the subtitles as an SRT file
with open(output_file, 'w') as f:
    for i, (timestamp, text) in enumerate(subtitles):
        start_time = int(timestamp)
        end_time = start_time + 1000  # assume each cue lasts one second
        f.write(f'{i + 1}\n')
        f.write(f'{ms_to_srt(start_time)} --> {ms_to_srt(end_time)}\n')
        f.write(f'{text}\n\n')
```
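Running Tesseract on every frame is very slow for anything longer than a short clip; sampling roughly one frame per second is usually enough for subtitles. A sketch of a drop-in replacement for the read loop above (the variable names are ours):

```python
fps = video.get(cv2.CAP_PROP_FPS) or 25  # fall back if FPS metadata is missing
frame_idx = 0
while True:
    ret, frame = video.read()
    if not ret:
        break
    if frame_idx % int(fps) == 0:  # OCR roughly one frame per second
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        subtitles.append((video.get(cv2.CAP_PROP_POS_MSEC),
                          pytesseract.image_to_string(gray)))
    frame_idx += 1
```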
Recognize text in an image with pytesseract

```python
import pytesseract
from PIL import Image

# Point pytesseract at the Tesseract binary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def ocr(image_path):
    # Load the image
    image = Image.open(image_path)
    # Recognize the text
    text = pytesseract.image_to_string(image, lang='eng')
    return text


# Image path
image_path = 'image.jpg'
# Run OCR
result = ocr(image_path)
# Print the result
print(result)
```
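For the Chinese subtitles handled elsewhere in these notes, Tesseract needs a Chinese language pack; assuming `chi_sim.traineddata` is installed in the tessdata directory, the call looks like this:

```python
# Requires the chi_sim language pack in Tesseract's tessdata directory
text = pytesseract.image_to_string(Image.open('image.jpg'), lang='chi_sim')
print(text)
```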
Extract video subtitles (unverified)

```python
# encoding:utf-8
import cv2
import numpy as np
import pytesseract

# Point pytesseract at the Tesseract binary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def caption_region_extr(src_img, thresh_, v_cut_ratio_):
    # Convert to grayscale
    imgray = src_img
    if len(src_img.shape) == 3 and src_img.shape[-1] == 3:
        imgray = cv2.cvtColor(src_img, cv2.COLOR_BGR2GRAY)
    # Binarize
    th, img_bn = cv2.threshold(imgray, thresh_, 255, cv2.THRESH_BINARY)
    # Vertical cut: keep only the bottom strip where captions live
    crop_start = int(v_cut_ratio_ * img_bn.shape[0])
    crop_end = img_bn.shape[0]
    v_cut_img = img_bn[crop_start:crop_end, :]
    # Horizontal cut: find the leftmost and rightmost non-empty columns
    h_left = 0
    h_right = 0
    for i in range(v_cut_img.shape[1]):
        if np.any(v_cut_img[:, i] > 0):
            h_left = i
            break
    for i in range(v_cut_img.shape[1] - 1, -1, -1):
        if np.any(v_cut_img[:, i] > 0):
            h_right = i
            break
    # A 1-element array is the "no caption found" sentinel
    h_cut_img = np.zeros(1)
    if (h_right - h_left) > 20:
        # Expand the region a little
        h_left = max(h_left - 10, 0)
        h_right = min(h_right + 10, v_cut_img.shape[1] - 1)
        h_cut_img = v_cut_img[:, h_left:h_right + 1]
    return h_cut_img


def Equal_(region_a_, region_b_, thresh_):
    # Two caption regions are "equal" if their cosine distance is below thresh_
    if region_a_.shape != region_b_.shape:
        return False
    a = region_a_.reshape(-1).astype(np.float64)
    b = region_b_.reshape(-1).astype(np.float64)
    a_norm = np.linalg.norm(a)
    b_norm = np.linalg.norm(b)
    similarity = np.dot(a, b.T) / (a_norm * b_norm)
    dist = 1. - similarity
    return dist <= thresh_


# Open the video
videoCapture = cv2.VideoCapture('video.mp4')
# Read the first frame
success, frame = videoCapture.read()
name_cnt = 0
crop_ratio_ = 0.9
pre_cap_region = np.zeros(1)
ocr_ = list()
b_reference_ = False  # set True to also dump the caption crops as images
while success:
    cap_region = caption_region_extr(frame, 200, crop_ratio_)
    # First caption seen
    if (len(pre_cap_region) == 1) and (len(cap_region.shape) != 1):
        pre_cap_region = cap_region
        if b_reference_:
            img_name = "zimu" + str(name_cnt) + ".jpg"
            cv2.imwrite(img_name, pre_cap_region)
            name_cnt += 1
        text = pytesseract.image_to_string(pre_cap_region)
        ocr_.append(text)
    # A caption region that differs from the previous one
    if len(cap_region.shape) != 1:
        if not Equal_(cap_region, pre_cap_region, 0.1):
            if b_reference_:
                img_name = "zimu" + str(name_cnt) + ".jpg"
                cv2.imwrite(img_name, cap_region)
                name_cnt += 1
            text = pytesseract.image_to_string(cap_region)
            if text != ocr_[-1]:
                ocr_.append(text)
            pre_cap_region = cap_region
    success, frame = videoCapture.read()  # read the next frame
videoCapture.release()
with open("result.txt", "w") as wf:
    for line in ocr_:
        wf.write(line + "\n")
```
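A quick way to sanity-check the cosine-distance threshold used by `Equal_` is to feed it synthetic caption strips (the test arrays below are ours):

```python
import numpy as np

a = np.full((10, 100), 255, dtype=np.uint8)  # all-white strip
b = a.copy()
b[:, :5] = 0                                 # same strip with a small change
print(Equal_(a, a, 0.1))  # True: identical regions
print(Equal_(a, b, 0.1))  # True: the cosine distance is ~0.025, under 0.1
```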
Download the subtitles of a Bilibili video by BV id

```python
# encoding:utf-8
import os
import time
import requests
import json


def download_subtitle_json(bvid: str):
    sub_dir = f'./{bvid}'
    if not os.path.isdir(sub_dir):
        os.mkdir(sub_dir)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': f'https://www.bilibili.com/video/{bvid}/?p=1',
        'Origin': 'https://www.bilibili.com',
        'Connection': 'keep-alive',
        'Cookie': "xxxx",  # replace with your own cookie
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
    }
    resp = requests.get(f'https://www.bilibili.com/video/{bvid}/', headers=headers)
    # Pull the numeric aid out of the page source
    text = resp.text
    aid = text[text.find('"aid"') + 6:]
    aid = aid[:aid.find(',')]
    # Fetch the cid list for every part of the video
    cid_back = requests.get("http://api.bilibili.com/x/player/pagelist?bvid={}".format(bvid), headers=headers)
    if cid_back.status_code != 200:
        print('Failed to fetch the pagelist')
        return
    cid_json = json.loads(cid_back.content)
    for item in cid_json['data']:
        cid = item['cid']
        title = item['part'] + '.json'
        # Strip characters that are illegal in file names
        for t in title:
            if t in '\\/:*?\"<>|':
                title = title.replace(t, '')
        params = {
            'aid': aid,
            'cid': cid,
            'isGaiaAvoided': 'false',
            'web_location': '1315873',
            # w_rid is a WBI signature; a hardcoded value can expire
            'w_rid': '364cdf378b75ef6a0cee77484ce29dbb',
            'wts': int(time.time()),
        }
        wbi_resp = requests.get('https://api.bilibili.com/x/player/wbi/v2', params=params, headers=headers)
        if wbi_resp.status_code != 200:
            print('Failed to fetch the subtitle link')
            continue
        subtitle_links = wbi_resp.json()['data']["subtitle"]['subtitles']
        if subtitle_links:
            # Download the first subtitle track by default
            subtitle_url = "https:" + subtitle_links[0]['subtitle_url']
            subtitle_resp = requests.get(subtitle_url, headers=headers)
            with open(os.path.join(sub_dir, title), 'w', encoding='utf-8') as f:
                f.write(subtitle_resp.text)


if __name__ == '__main__':
    BVID = 'BV16e4y1k7Rs'  # the video's BV id
    download_subtitle_json(BVID)
```
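The downloaded file is Bilibili's own JSON format, whose `body` entries carry `from`/`to` timestamps in seconds plus a `content` string; converting it to SRT is mechanical. A sketch under that assumption (the `bili_json_to_srt` helper is ours):

```python
import json


def bili_json_to_srt(json_path, srt_path):
    with open(json_path, encoding='utf-8') as f:
        body = json.load(f)['body']

    def fmt(sec):
        # seconds (float) -> HH:MM:SS,mmm
        ms = int(sec * 1000)
        h, rem = divmod(ms, 3600000)
        m, rem = divmod(rem, 60000)
        s, ms = divmod(rem, 1000)
        return f'{h:02}:{m:02}:{s:02},{ms:03}'

    with open(srt_path, 'w', encoding='utf-8') as f:
        for i, cue in enumerate(body, 1):
            f.write(f"{i}\n{fmt(cue['from'])} --> {fmt(cue['to'])}\n{cue['content']}\n\n")
```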
Extract video subtitles with PaddleOCR

```python
import json
from moviepy.editor import VideoFileClip
from paddleocr import PaddleOCR

# Crop and rescale the video down to the subtitle strip
clip = VideoFileClip('video.mp4')
print("Ori FPS:{} Duration:{} Width:{} Height:{}".format(clip.fps, clip.duration, clip.w, clip.h))
cut_clip = clip.crop(y2=clip.h - 11, height=70)
cut_clip = cut_clip.set_fps(3)
print("Cut FPS:{} Duration:{} Width:{} Height:{}".format(cut_clip.fps, cut_clip.duration, cut_clip.w, cut_clip.h))
epoch = 10
step = cut_clip.duration / epoch
# Split the strip into several segments
clips = []
index = 0
while index < epoch:
    # Start and end time of this segment
    start = index * step
    end = min(start + step, clip.duration)
    if start < clip.duration:
        sub_clip = cut_clip.subclip(start, end)
        print("index: {} start: {} end: {}".format(index, start, end))
        clips.append([start, sub_clip])
    else:
        break
    index += 1


def process_frame_by_ocr(st, tmp_clip):
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True)
    frame_rate = 1 / 3  # each frame covers 1/3 s at 3 fps
    for cnt, cur_frame in enumerate(tmp_clip.iter_frames()):
        cur_start = frame_rate * (cnt + 1) + st
        try:
            # det=True runs text detection before recognition
            result = ocr.ocr(cur_frame, det=True)
            if result is not None:
                see = result[0][0][1]  # (text, confidence) of the first detected line
                cur_time = int(cur_start)
                doc_json = {'st': cur_time, "text": see}
                ocr_text = json.dumps(doc_json, ensure_ascii=False)
                with open('result.json', 'a', encoding='utf-8') as f:
                    f.write(ocr_text + '\n')
        except Exception:
            pass


if __name__ == '__main__':
    # Process the segments sequentially; moviepy clip readers do not pickle,
    # so fanning out with multiprocessing would require each worker to
    # reopen the video itself.
    for real_start, sub_clip in clips:
        process_frame_by_ocr(real_start, sub_clip)
```
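Each segment appends one JSON object per line to result.json, so collapsing consecutive duplicates afterwards gives a cleaner transcript; a minimal sketch:

```python
import json

lines, last = [], None
with open('result.json', encoding='utf-8') as f:
    for raw in f:
        # "text" may be a [text, confidence] pair, depending on what the OCR block stored
        text = json.loads(raw)['text']
        if text != last:  # keep only changes, drop repeated frames
            lines.append(text)
            last = text
print('\n'.join(str(t) for t in lines))
```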