Check whether two images are identical

```python
from PIL import Image
import numpy as np


def check_img_repeat():
    """
    Perceptual hashing generates a "fingerprint" of an image: as long as the
    content stays the same, the fingerprint stays similar even after quality
    changes, rescaling, or other processing, which makes it useful for image
    similarity comparison. This implementation uses the difference-hash
    (dHash) variant.
    """
    def dhash(img, hash_size=8):
        # Convert to grayscale and shrink to (hash_size + 1) x hash_size
        img = img.convert('L').resize(
            (hash_size + 1, hash_size),
            Image.LANCZOS,
        )
        # Convert the image to an array
        pixels = np.array(img)
        # Compare adjacent pixels
        diff = pixels[:, 1:] > pixels[:, :-1]
        # Build the hash from the comparison results
        return sum(2 ** i for (i, v) in enumerate(diff.flatten()) if v)

    # Hamming distance between two hashes, i.e. the number of differing bits
    def hamming_distance(hash1, hash2):
        return bin(hash1 ^ hash2).count('1')

    img_1 = Image.open("1.png")
    img_2 = Image.open("2.png")
    # Compute the dHash of each image
    hash1 = dhash(img_1)
    hash2 = dhash(img_2)
    print(hash1, hash2)
    distance = hamming_distance(hash1, hash2)
    # Judge similarity from the Hamming distance
    print(f"Hamming distance between the images: {distance}")
    if distance <= 5:
        print("Images are similar.")
    else:
        print("Images are not similar.")


check_img_repeat()
```
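Note that dHash measures perceptual similarity rather than strict identity. If "identical" really means byte-for-byte equality, a cryptographic digest of the file contents is simpler and exact; a minimal sketch, reusing the `1.png`/`2.png` file names from above:

```python
import hashlib


def file_md5(path):
    # Hash the raw file bytes; identical files produce identical digests
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


print("identical" if file_md5("1.png") == file_md5("2.png") else "different")
```

Keep in mind this compares encodings, not pixels: the same picture saved twice with different compression settings hashes differently, which is exactly the case dHash handles.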
Check whether two images are identical, for remote images stored in a database

```python
from PIL import Image
import numpy as np
import requests
from io import BytesIO
import mysql.connector


def select_user():
    # Connect to the MySQL database
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="123456",
        database="test"
    )
    # Create a cursor object
    mycursor = mydb.cursor()
    # Build the query
    sql = "SELECT uid, head FROM user"
    # Execute the query
    mycursor.execute(sql)
    # Fetch the results
    result = mycursor.fetchall()
    return result


def check_img_repeat():
    def dhash(img, hash_size=8):
        # Convert to grayscale and shrink to (hash_size + 1) x hash_size
        img = img.convert('L').resize(
            (hash_size + 1, hash_size),
            Image.LANCZOS,
        )
        # Convert the image to an array
        pixels = np.array(img)
        # Compare adjacent pixels
        diff = pixels[:, 1:] > pixels[:, :-1]
        # Build the hash from the comparison results
        return sum(2 ** i for (i, v) in enumerate(diff.flatten()) if v)

    # Hamming distance between two hashes, i.e. the number of differing bits
    def hamming_distance(hash1, hash2):
        return bin(hash1 ^ hash2).count('1')

    result = select_user()
    for row in result:
        # Download the image with requests
        response = requests.get(row[1])
        # Check whether the download succeeded
        if response.status_code == 200:
            # Wrap the image bytes in a file-like object
            image_file = BytesIO(response.content)
            # Open the downloaded image with Image.open
            img_1 = Image.open(image_file)
            img_2 = Image.open("2.png")
            # Compute the dHash of each image
            hash1 = dhash(img_1)
            hash2 = dhash(img_2)
            distance = hamming_distance(hash1, hash2)
            # Judge similarity from the Hamming distance
            if distance <= 5:
                print(row[0])
        else:
            print(row[0], "failed to download image")


check_img_repeat()
```
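To find duplicates among all the stored avatars, rather than comparing each against one local file, it can be cheaper to hash every image once and then compare the hashes pairwise. A rough sketch, assuming `dhash` and `hamming_distance` from the block above are lifted to module level and that rows are `(uid, url)` pairs as returned by `select_user()`:

```python
from io import BytesIO
from itertools import combinations

import requests
from PIL import Image


def find_duplicate_avatars(rows, threshold=5):
    # rows: iterable of (uid, image_url) pairs, e.g. the result of select_user()
    hashes = {}
    for uid, url in rows:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            hashes[uid] = dhash(Image.open(BytesIO(resp.content)))
    # Pairwise comparison is O(n^2), which is fine for modest tables
    for (uid_a, h_a), (uid_b, h_b) in combinations(hashes.items(), 2):
        if hamming_distance(h_a, h_b) <= threshold:
            print(uid_a, uid_b, "look like duplicates")
```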
Scrape Douban movie data

```python
# coding= utf-8
import re
from time import sleep
import requests
from lxml import etree
import random
import csv


def get_info(url, name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        with open('info_html.txt', 'a', encoding='utf-8') as finfo:
            finfo.write(resp.text)
    else:
        print("Failed to retrieve the info webpage")
        return
    html = resp.text
    tree = etree.HTML(html)
    # Director
    director = tree.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
    # Genres
    type_ = re.findall(r'property="v:genre">(.*?)</span>', html)
    type_ = '/'.join(type_)
    # Country/region (the regex matches the Chinese label in Douban's HTML)
    country = re.findall(r'地区:</span> (.*?)<br', html)[0]
    # Release year
    time = tree.xpath('//*[@id="content"]/h1/span[2]/text()')[0]
    time = time[1:5]
    # Rating
    rate = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
    # Number of ratings
    people = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
    print(name, director, type_, country, time, rate, people)  # print the result
    csvwriter.writerow((name, director, type_, country, time, rate, people))  # save to the CSV


def main(page, f):
    url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        with open('db_html.txt', 'a', encoding='utf-8') as fh:
            fh.write(resp.text)
    else:
        print("Failed to retrieve the webpage")
        return
    tree = etree.HTML(resp.text)
    # List of detail-page links
    href_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/@href')
    # List of movie titles
    name_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    for url, name in zip(href_list, name_list):
        f.flush()  # flush the CSV file
        try:
            get_info(url, name)  # scrape the detail page
        except Exception:
            pass
        sleep(1 + random.random())  # pause between requests
    print(f'Page {page + 1} done')


if __name__ == '__main__':
    # Open the file that will hold the data
    with open('movie-xpath.csv', 'a', encoding='utf-8', newline='') as f:
        csvwriter = csv.writer(f)
        # Write the header row
        csvwriter.writerow(('Title', 'Director', 'Genre', 'Country', 'Year', 'Rating', 'Votes'))
        for i in range(10):  # scrape 10 pages
            main(i, f)  # call the main function
            sleep(3 + random.random())
```
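Douban rate-limits aggressive clients, and the bare `except` around `get_info` silently drops any page that fails. One hedge is a small retry wrapper around `requests.get`; the `fetch` name and its parameters below are ours, not part of the original script:

```python
import requests
from time import sleep


def fetch(url, headers=None, retries=3, backoff=2.0):
    # Retry transient failures with a growing delay instead of dropping the page
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        sleep(backoff * (attempt + 1))
    return None
```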
Download Bilibili videos

```python
# encoding:utf-8
import requests
import re
import json


def getVideo(url=''):
    if url == '':
        print('Please provide a url')
        return
    headers = {
        # Referer: anti-hotlink header telling the server where the request came from
        "Referer": url,
        # User-Agent: basic browser/device identity
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        # "Cookie": "xxxx",  # fill in your own cookie if higher qualities require login
    }
    # Send the request
    response = requests.get(url=url, headers=headers)
    # Dump the HTML for debugging
    if response.status_code == 200:
        with open('videoHtml.txt', 'w', encoding='utf-8') as finfo:
            finfo.write(response.text)
    else:
        print("Failed to retrieve the info webpage")
        return
    html = response.text
    # Parse the page: extract the video title
    title = re.findall('title="(.*?)"', html)[0]
    # Strip characters that are illegal in file names
    for t in title:
        if t in '\\/:*?\"<>|':
            title = title.replace(t, '')
    # Extract the playback info
    info = re.findall('window.__playinfo__=(.*?)</script>', html)[0]
    # info is a JSON string; parse it into a dict
    json_data = json.loads(info)
    # Video stream URL
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    # Audio stream URL
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    # Download both streams
    video_content = requests.get(url=video_url, headers=headers).content
    audio_content = requests.get(url=audio_url, headers=headers).content
    # Save the data (the DASH audio is AAC even though it is saved as .mp3 here)
    with open(f'{title}.mp4', mode='wb') as v:
        v.write(video_content)
    with open(f'{title}.mp3', mode='wb') as a:
        a.write(audio_content)


url = "https://www.bilibili.com/video/BV16e4y1k7Rs/?spm_id_from=333.999.0.0&vd_source=d8494042caaa3e41d56c7e04cc1669dd"
getVideo(url)
```
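Bilibili serves video and audio as separate DASH streams, so the two files written above still need to be muxed together. A sketch that shells out to ffmpeg, assuming it is installed and on PATH (the `merge_av` helper is ours; file names follow the block above):

```python
import subprocess


def merge_av(title):
    # Copy both streams into one container without re-encoding
    subprocess.run([
        'ffmpeg', '-y',
        '-i', f'{title}.mp4',   # video-only stream
        '-i', f'{title}.mp3',   # audio stream (AAC despite the extension)
        '-c', 'copy',
        f'{title}_merged.mp4',
    ], check=True)
```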
Extract video subtitles with OCR

```python
# encoding:utf-8
import cv2
import pytesseract

# Point pytesseract at the Tesseract binary (adjust to your install)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Open the video
video_path = 'video.mp4'
video = cv2.VideoCapture(video_path)
# Path of the subtitle file to write
output_file = 'subtitles.srt'
subtitles = []
# Process the video frame by frame
while True:
    ret, frame = video.read()
    if not ret:
        break
    # Convert the frame to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Extract text with Tesseract
    text = pytesseract.image_to_string(gray)
    # Timestamp of the current frame in milliseconds
    timestamp = video.get(cv2.CAP_PROP_POS_MSEC)
    # Append the text and timestamp to the subtitle list
    subtitles.append((timestamp, text))
# Release the video stream
video.release()


def ms_to_srt(ms):
    # Format milliseconds as the HH:MM:SS,mmm timestamp SRT expects
    ms = int(ms)
    h, rem = divmod(ms, 3600000)
    m, rem = divmod(rem, 60000)
    s, ms = divmod(rem, 1000)
    return f'{h:02}:{m:02}:{s:02},{ms:03}'


# Save the subtitles as an SRT file
with open(output_file, 'w') as f:
    for i, (timestamp, text) in enumerate(subtitles):
        start_time = int(timestamp)
        end_time = start_time + 1000  # assume each cue lasts one second
        f.write(f'{i + 1}\n')
        f.write(f'{ms_to_srt(start_time)} --> {ms_to_srt(end_time)}\n')
        f.write(f'{text}\n\n')
```
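Running Tesseract on every frame is very slow for anything longer than a short clip; sampling roughly one frame per second is usually enough for subtitles. A sketch of a drop-in replacement for the read loop above (the variable names are ours):

```python
fps = video.get(cv2.CAP_PROP_FPS) or 25  # fall back if FPS metadata is missing
frame_idx = 0
while True:
    ret, frame = video.read()
    if not ret:
        break
    if frame_idx % int(fps) == 0:  # OCR roughly one frame per second
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        subtitles.append((video.get(cv2.CAP_PROP_POS_MSEC),
                          pytesseract.image_to_string(gray)))
    frame_idx += 1
```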
Recognize text in an image with pytesseract

```python
import pytesseract
from PIL import Image

# Point pytesseract at the Tesseract binary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def ocr(image_path):
    # Load the image
    image = Image.open(image_path)
    # Recognize the text
    text = pytesseract.image_to_string(image, lang='eng')
    return text


# Image path
image_path = 'image.jpg'
# Run OCR
result = ocr(image_path)
# Print the result
print(result)
```
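For the Chinese subtitles handled elsewhere in these notes, Tesseract needs a Chinese language pack; assuming `chi_sim.traineddata` is installed in the tessdata directory, the call looks like this:

```python
# Requires the chi_sim language pack in Tesseract's tessdata directory
text = pytesseract.image_to_string(Image.open('image.jpg'), lang='chi_sim')
print(text)
```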
Extract video subtitles (unverified)

```python
# encoding:utf-8
import cv2
import numpy as np
import pytesseract

# Point pytesseract at the Tesseract binary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def caption_region_extr(src_img, thresh_, v_cut_ratio_):
    # Convert to grayscale
    imgray = src_img
    if len(src_img.shape) == 3 and src_img.shape[-1] == 3:
        imgray = cv2.cvtColor(src_img, cv2.COLOR_BGR2GRAY)
    # Binarize
    th, img_bn = cv2.threshold(imgray, thresh_, 255, cv2.THRESH_BINARY)
    # Vertical cut: keep only the bottom strip where captions live
    crop_start = int(v_cut_ratio_ * img_bn.shape[0])
    crop_end = img_bn.shape[0]
    v_cut_img = img_bn[crop_start:crop_end, :]
    # Horizontal cut: find the leftmost and rightmost non-empty columns
    h_left = 0
    h_right = 0
    for i in range(v_cut_img.shape[1]):
        if np.any(v_cut_img[:, i] > 0):
            h_left = i
            break
    for i in range(v_cut_img.shape[1] - 1, -1, -1):
        if np.any(v_cut_img[:, i] > 0):
            h_right = i
            break
    # A 1-element array is the "no caption found" sentinel
    h_cut_img = np.zeros(1)
    if (h_right - h_left) > 20:
        # Expand the region a little
        h_left = max(h_left - 10, 0)
        h_right = min(h_right + 10, v_cut_img.shape[1] - 1)
        h_cut_img = v_cut_img[:, h_left:h_right + 1]
    return h_cut_img


def Equal_(region_a_, region_b_, thresh_):
    # Two caption regions are "equal" if their cosine distance is below thresh_
    if region_a_.shape != region_b_.shape:
        return False
    a = region_a_.reshape(-1).astype(np.float64)
    b = region_b_.reshape(-1).astype(np.float64)
    a_norm = np.linalg.norm(a)
    b_norm = np.linalg.norm(b)
    similarity = np.dot(a, b.T) / (a_norm * b_norm)
    dist = 1. - similarity
    return dist <= thresh_


# Open the video
videoCapture = cv2.VideoCapture('video.mp4')
# Read the first frame
success, frame = videoCapture.read()
name_cnt = 0
crop_ratio_ = 0.9
pre_cap_region = np.zeros(1)
ocr_ = list()
b_reference_ = False  # set True to also dump the caption crops as images
while success:
    cap_region = caption_region_extr(frame, 200, crop_ratio_)
    # First caption seen
    if (len(pre_cap_region) == 1) and (len(cap_region.shape) != 1):
        pre_cap_region = cap_region
        if b_reference_:
            img_name = "zimu" + str(name_cnt) + ".jpg"
            cv2.imwrite(img_name, pre_cap_region)
            name_cnt += 1
        text = pytesseract.image_to_string(pre_cap_region)
        ocr_.append(text)
    # A caption region that differs from the previous one
    if len(cap_region.shape) != 1:
        if not Equal_(cap_region, pre_cap_region, 0.1):
            if b_reference_:
                img_name = "zimu" + str(name_cnt) + ".jpg"
                cv2.imwrite(img_name, cap_region)
                name_cnt += 1
            text = pytesseract.image_to_string(cap_region)
            if text != ocr_[-1]:
                ocr_.append(text)
            pre_cap_region = cap_region
    success, frame = videoCapture.read()  # read the next frame
videoCapture.release()
with open("result.txt", "w") as wf:
    for line in ocr_:
        wf.write(line + "\n")
```
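A quick way to sanity-check the cosine-distance threshold used by `Equal_` is to feed it synthetic caption strips (the test arrays below are ours):

```python
import numpy as np

a = np.full((10, 100), 255, dtype=np.uint8)  # all-white strip
b = a.copy()
b[:, :5] = 0                                 # same strip with a small change
print(Equal_(a, a, 0.1))  # True: identical regions
print(Equal_(a, b, 0.1))  # True: the cosine distance is ~0.025, under 0.1
```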
Download the subtitles of a Bilibili video by BV id

```python
# encoding:utf-8
import os
import time
import requests
import json


def download_subtitle_json(bvid: str):
    sub_dir = f'./{bvid}'
    if not os.path.isdir(sub_dir):
        os.mkdir(sub_dir)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': f'https://www.bilibili.com/video/{bvid}/?p=1',
        'Origin': 'https://www.bilibili.com',
        'Connection': 'keep-alive',
        'Cookie': "xxxx",  # replace with your own cookie
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
    }
    resp = requests.get(f'https://www.bilibili.com/video/{bvid}/', headers=headers)
    # Pull the numeric aid out of the page source
    text = resp.text
    aid = text[text.find('"aid"') + 6:]
    aid = aid[:aid.find(',')]
    # Fetch the cid list for every part of the video
    cid_back = requests.get("http://api.bilibili.com/x/player/pagelist?bvid={}".format(bvid), headers=headers)
    if cid_back.status_code != 200:
        print('Failed to fetch the pagelist')
        return
    cid_json = json.loads(cid_back.content)
    for item in cid_json['data']:
        cid = item['cid']
        title = item['part'] + '.json'
        # Strip characters that are illegal in file names
        for t in title:
            if t in '\\/:*?\"<>|':
                title = title.replace(t, '')
        params = {
            'aid': aid,
            'cid': cid,
            'isGaiaAvoided': 'false',
            'web_location': '1315873',
            # w_rid is a WBI signature; a hardcoded value can expire
            'w_rid': '364cdf378b75ef6a0cee77484ce29dbb',
            'wts': int(time.time()),
        }
        wbi_resp = requests.get('https://api.bilibili.com/x/player/wbi/v2', params=params, headers=headers)
        if wbi_resp.status_code != 200:
            print('Failed to fetch the subtitle link')
            continue
        subtitle_links = wbi_resp.json()['data']["subtitle"]['subtitles']
        if subtitle_links:
            # Download the first subtitle track by default
            subtitle_url = "https:" + subtitle_links[0]['subtitle_url']
            subtitle_resp = requests.get(subtitle_url, headers=headers)
            with open(os.path.join(sub_dir, title), 'w', encoding='utf-8') as f:
                f.write(subtitle_resp.text)


if __name__ == '__main__':
    BVID = 'BV16e4y1k7Rs'  # the video's BV id
    download_subtitle_json(BVID)
```
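The downloaded file is Bilibili's own JSON format, whose `body` entries carry `from`/`to` timestamps in seconds plus a `content` string; converting it to SRT is mechanical. A sketch under that assumption (the `bili_json_to_srt` helper is ours):

```python
import json


def bili_json_to_srt(json_path, srt_path):
    with open(json_path, encoding='utf-8') as f:
        body = json.load(f)['body']

    def fmt(sec):
        # seconds (float) -> HH:MM:SS,mmm
        ms = int(sec * 1000)
        h, rem = divmod(ms, 3600000)
        m, rem = divmod(rem, 60000)
        s, ms = divmod(rem, 1000)
        return f'{h:02}:{m:02}:{s:02},{ms:03}'

    with open(srt_path, 'w', encoding='utf-8') as f:
        for i, cue in enumerate(body, 1):
            f.write(f"{i}\n{fmt(cue['from'])} --> {fmt(cue['to'])}\n{cue['content']}\n\n")
```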
Extract video subtitles with PaddleOCR

```python
import json
from moviepy.editor import VideoFileClip
from paddleocr import PaddleOCR

# Crop and rescale the video down to the subtitle strip
clip = VideoFileClip('video.mp4')
print("Ori FPS:{} Duration:{} Width:{} Height:{}".format(clip.fps, clip.duration, clip.w, clip.h))
cut_clip = clip.crop(y2=clip.h - 11, height=70)
cut_clip = cut_clip.set_fps(3)
print("Cut FPS:{} Duration:{} Width:{} Height:{}".format(cut_clip.fps, cut_clip.duration, cut_clip.w, cut_clip.h))
epoch = 10
step = cut_clip.duration / epoch
# Split the strip into several segments
clips = []
index = 0
while index < epoch:
    # Start and end time of this segment
    start = index * step
    end = min(start + step, clip.duration)
    if start < clip.duration:
        sub_clip = cut_clip.subclip(start, end)
        print("index: {} start: {} end: {}".format(index, start, end))
        clips.append([start, sub_clip])
    else:
        break
    index += 1


def process_frame_by_ocr(st, tmp_clip):
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True)
    frame_rate = 1 / 3  # each frame covers 1/3 s at 3 fps
    for cnt, cur_frame in enumerate(tmp_clip.iter_frames()):
        cur_start = frame_rate * (cnt + 1) + st
        try:
            # det=True runs text detection before recognition
            result = ocr.ocr(cur_frame, det=True)
            if result is not None:
                see = result[0][0][1]  # (text, confidence) of the first detected line
                cur_time = int(cur_start)
                doc_json = {'st': cur_time, "text": see}
                ocr_text = json.dumps(doc_json, ensure_ascii=False)
                with open('result.json', 'a', encoding='utf-8') as f:
                    f.write(ocr_text + '\n')
        except Exception:
            pass


if __name__ == '__main__':
    # Process the segments sequentially; moviepy clip readers do not pickle,
    # so fanning out with multiprocessing would require each worker to
    # reopen the video itself.
    for real_start, sub_clip in clips:
        process_frame_by_ocr(real_start, sub_clip)
```
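Each segment appends one JSON object per line to result.json, so collapsing consecutive duplicates afterwards gives a cleaner transcript; a minimal sketch:

```python
import json

lines, last = [], None
with open('result.json', encoding='utf-8') as f:
    for raw in f:
        # "text" may be a [text, confidence] pair, depending on what the OCR block stored
        text = json.loads(raw)['text']
        if text != last:  # keep only changes, drop repeated frames
            lines.append(text)
            last = text
print('\n'.join(str(t) for t in lines))
```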