Python 爬取 B 站视频

参考:https://cloud.tencent.com/developer/article/1768680

参考文章中的代码有点问题:请求头需要修改(要带上 referer 防盗链和 User-Agent)。修改后的代码如下:

Python 代码:
import requests
import re  # 正则表达式
import pprint
import json
from moviepy.editor import AudioFileClip, VideoFileClip
from bs4 import BeautifulSoup as bs

# Headers attached to every request made by send_request().
headers = {
    # Anti-hotlinking: tells the server which page the request was
    # navigated from.
    'referer': 'https://www.bilibili.com/a',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/86.0.4240.198 Safari/537.36'
    ),
}

def send_request(url, timeout=30):
    """Send a GET request for *url* using the module-level ``headers``.

    :param url: address to fetch
    :param timeout: seconds to wait for the server to connect / send data
        before giving up; without a timeout ``requests`` may block forever
        on a stalled connection
    :return: the ``requests`` response object (status is not checked here)
    """
    return requests.get(url=url, headers=headers, timeout=timeout)

def get_video_data(html_data):
    """Parse a video page and extract its title and media stream URLs.

    :param html_data: HTML text of the video page
    :return: list ``[title, audio_url, video_url]``
    :raises ValueError: if the title element or the embedded
        ``window.__playinfo__`` JSON cannot be found in the page
    """
    # Extract the video title
    soup = bs(html_data, 'lxml')
    title_tags = soup.find_all(name='h1', attrs={"class": "video-title special-text-indent"})
    if not title_tags:
        raise ValueError('video title <h1> element not found in page')
    title = title_tags[0].get_text()

    # Extract the JSON blob describing the media streams.
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    matches = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', html_data)
    if not matches:
        raise ValueError('window.__playinfo__ data not found in page')
    json_data = json.loads(matches[0])
    pprint.pprint(json_data)

    dash = json_data['data']['dash']

    # URL of the audio stream
    audio_url = dash['audio'][0]['backupUrl'][0]
    print('解析到的音频地址:', audio_url)

    # URL of the video (picture) stream
    video_url = dash['video'][0]['backupUrl'][0]
    print('解析到的视频地址:', video_url)

    return [title, audio_url, video_url]

def save_data(file_name, audio_url, video_url):
    """Download the audio and video streams and save them to disk.

    The audio is written to ``<file_name>.mp3`` and the video to
    ``<file_name>.mp4``. Both downloads complete before anything is written.

    :param file_name: output path without extension
    :param audio_url: URL of the audio stream
    :param video_url: URL of the video stream
    :raises requests.HTTPError: if either download returns an error status,
        so that an error page is never saved as a media file
    """
    print('正在请求音频数据')
    audio_response = send_request(audio_url)
    audio_response.raise_for_status()
    audio_data = audio_response.content

    print('正在请求视频数据')
    video_response = send_request(video_url)
    video_response.raise_for_status()
    video_data = video_response.content

    with open(file_name + '.mp3', mode='wb') as f:
        f.write(audio_data)
        print('正在保存音频数据')
    with open(file_name + '.mp4', mode='wb') as f:
        f.write(video_data)
        print('正在保存视频数据')

def merge_data(video_name, output_path="output.mp4"):
    """Combine ``<video_name>.mp3`` and ``<video_name>.mp4`` into one file.

    The longer of the two tracks is trimmed to the length of the shorter one
    before the audio is attached to the video.

    :param video_name: common path (without extension) of the two input files
    :param output_path: path of the merged video; defaults to ``output.mp4``
    """
    print('视频合成开始:', video_name)
    # Context managers close both clips even if encoding fails.
    with AudioFileClip(video_name + '.mp3') as audioclip, \
            VideoFileClip(video_name + '.mp4') as videoclip:
        video_time = videoclip.duration
        audio_time = audioclip.duration

        if video_time > audio_time:
            # Video is longer than audio: trim the video
            videoclip_new = videoclip.subclip(0, audio_time)
            audioclip_new = audioclip
        else:
            # Audio is at least as long as the video: trim the audio
            videoclip_new = videoclip
            audioclip_new = audioclip.subclip(0, video_time)

        # Attach the audio track and write the result
        video_with_new_audio = videoclip_new.set_audio(audioclip_new)
        video_with_new_audio.write_videofile(output_path,
                                             codec='libx264',
                                             audio_codec='aac',
                                             temp_audiofile='temp-video.m4a',
                                             remove_temp=True
                                             )
    print('视频合成结束:', video_name)


url = 'https://www.bilibili.com/video/BV1bK421a7qG/?spm_id_from=333.1007.tianma.6-4-22.click'
response = send_request(url)
# Decode the page with the charset it declares; fall back to the detected
# encoding when the page declares none (the list is then empty).
declared_encodings = requests.utils.get_encodings_from_content(response.text)
response.encoding = declared_encodings[0] if declared_encodings else response.apparent_encoding
html_data = response.text
video_data = get_video_data(html_data)
title, audio_url, video_url = video_data
# The title becomes a file name, so replace characters that are not allowed
# in file names (path separators, ':', '?', ...).
file_name = re.sub(r'[\\/:*?"<>|\r\n]+', '_', title).strip() or 'video'
save_data(file_name, audio_url, video_url)
merge_data(file_name)

效果

小姐姐挺靓,就是左下角有水印。下面想办法去除水印,参考 CSDN 博客文章《python实战之去除视频水印&字幕》:

Python 代码:
import os
import sys
import cv2
import numpy
from moviepy import editor
 
# Scratch file that receives the processed, silent frames before the
# original audio is merged back in.
TEMP_VIDEO = "temp.mp4"
 
 
class WatermarkRemover:
    """Remove a watermark or burned-in subtitles from the videos in a folder.

    Each file found in ``video_path`` is inpainted frame by frame with
    OpenCV, written to the scratch file ``TEMP_VIDEO``, and then combined
    with the audio of the source file into the ``output`` directory.
    """

    def __init__(self, video_path, output, threshold: int, kernel_size: int):
        """Store the processing configuration.

        :param video_path: directory whose files are processed
        :param output: directory that receives the processed videos
        :param threshold: threshold used for binary segmentation
        :param kernel_size: side length of the square dilation kernel
        """
        self.threshold = threshold
        self.kernel_size = kernel_size
        self.video_path = video_path
        self.output = output

    def select_roi(self, img: numpy.ndarray, hint: str) -> list:
        """Let the user box the watermark/subtitle area (SPACE or ENTER to finish).

        The image is displayed at 70 % of its size and the selection is
        scaled back to the coordinates of the original image.

        :param img: image to display
        :param hint: title of the selection window
        :return: the four selected ROI values, scaled to the original image
        """
        scale = 0.7
        shown_size = (int(scale * img.shape[1]), int(scale * img.shape[0]))
        shown = cv2.resize(img, shown_size)
        roi = cv2.selectROI(hint, shown, False, False)
        cv2.destroyAllWindows()
        return [int(value / scale) for value in roi]

    def dilate_mask(self, mask: numpy.ndarray) -> numpy.ndarray:
        """Dilate *mask* to enlarge the masked area.

        :param mask: mask image
        :return: dilated mask
        """
        kernel = numpy.ones((self.kernel_size, self.kernel_size), numpy.uint8)
        return cv2.dilate(mask, kernel)

    def generate_single_mask(self, img: numpy.ndarray, roi: list, threshold: int) -> numpy.ndarray:
        """Build the mask of a single frame from the manually selected ROI.

        Pixels outside the ROI are zeroed, then the grayscale image is
        thresholded so that only bright pixels inside the ROI remain.

        :param img: single BGR frame
        :param roi: selected region, indexed as ``[x, y, width, height]``
        :param threshold: binarisation threshold
        :return: mask image
        """
        # Invalid region: terminate the program
        if len(roi) != 4:
            print('NULL ROI!')
            sys.exit()

        top, bottom = int(roi[1]), int(roi[1] + roi[3])
        left, right = int(roi[0]), int(roi[0] + roi[2])

        # Copy only the grayscale pixels that lie inside the ROI
        roi_img = numpy.zeros(img.shape[:2], numpy.uint8)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        roi_img[top:bottom, left:right] = gray[top:bottom, left:right]

        # Threshold segmentation
        _, mask = cv2.threshold(roi_img, threshold, 255, cv2.THRESH_BINARY)
        return mask

    def generate_watermark_mask(self, video_path: str) -> numpy.ndarray:
        """Build the watermark mask by AND-ing masks of several sampled frames.

        :param video_path: path of the video file
        :return: dilated watermark mask
        :raises ValueError: if no frame can be read from *video_path*
        """
        video = cv2.VideoCapture(video_path)
        try:
            success, frame = video.read()
            if not success:
                raise ValueError('cannot read any frame from {!r}'.format(video_path))

            roi = self.select_roi(frame, 'select watermark ROI')
            mask = numpy.full(frame.shape[:2], 255, numpy.uint8)

            # Sample a frame every fifth of the reported frame count. Clamp
            # to 1: the count may be below 5 (or not reported), and a step
            # of 0 would make the modulo below raise ZeroDivisionError.
            step = max(int(video.get(cv2.CAP_PROP_FRAME_COUNT)) // 5, 1)
            index = 0
            while success:
                if index % step == 0:
                    single = self.generate_single_mask(frame, roi, self.threshold)
                    mask = cv2.bitwise_and(mask, single)
                success, frame = video.read()
                index += 1
        finally:
            video.release()

        return self.dilate_mask(mask)

    def generate_subtitle_mask(self, frame: numpy.ndarray, roi: list) -> numpy.ndarray:
        """Build the subtitle mask of a single frame.

        Only the vertical extent of *roi* is used; the mask region spans the
        full width of the frame.

        :param frame: single frame
        :param roi: manually selected region
        :return: dilated subtitle mask
        """
        full_width_roi = [0, roi[1], frame.shape[1], roi[3]]
        mask = self.generate_single_mask(frame, full_width_roi, self.threshold)
        return self.dilate_mask(mask)

    def inpaint_image(self, img: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
        """Inpaint the masked area of *img*.

        :param img: single frame
        :param mask: mask marking the pixels to repair
        :return: repaired image
        """
        return cv2.inpaint(img, mask, 1, cv2.INPAINT_TELEA)

    def merge_audio(self, input_path: str, output_path: str, temp_path: str):
        """Combine the audio of the source video with the processed video.

        :param input_path: path of the original video file
        :param output_path: path of the file with audio and video combined
        :param temp_path: path of the silent, processed video file
        """
        with editor.VideoFileClip(input_path) as source, \
                editor.VideoFileClip(temp_path) as silent:
            clip = silent.set_audio(source.audio)
            # write_videofile is the non-deprecated name of to_videofile
            clip.write_videofile(output_path)

    def _list_videos(self) -> list:
        """Return the paths of all entries in the input directory."""
        return [os.path.join(self.video_path, i) for i in os.listdir(self.video_path)]

    @staticmethod
    def _open_writer(video):
        """Create a writer for TEMP_VIDEO matching the fps and size of *video*."""
        fps = video.get(cv2.CAP_PROP_FPS)
        size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        return cv2.VideoWriter(TEMP_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)

    def _output_path(self, name: str, suffix: str) -> str:
        """Return the output path for source file *name* with *suffix* appended."""
        # splitext keeps dots inside the base name ("a.b.mp4" -> "a.b")
        stem = os.path.splitext(os.path.basename(name))[0]
        return os.path.join(self.output, stem + suffix)

    @staticmethod
    def _remove_temp_video():
        """Delete the scratch video file if it exists."""
        if os.path.exists(TEMP_VIDEO):
            os.remove(TEMP_VIDEO)

    def remove_video_watermark(self):
        """Remove the watermark from every video in the input directory.

        The mask is generated from the first file and reused for all others.
        """
        os.makedirs(self.output, exist_ok=True)
        mask = None

        try:
            for i, name in enumerate(self._list_videos()):
                if i == 0:
                    # Generate the watermark mask
                    mask = self.generate_watermark_mask(name)

                video = cv2.VideoCapture(name)
                video_writer = self._open_writer(video)
                try:
                    # Process the video frame by frame
                    success, frame = video.read()
                    while success:
                        video_writer.write(self.inpaint_image(frame, mask))
                        success, frame = video.read()
                finally:
                    video.release()
                    video_writer.release()

                # Re-attach the audio track
                output_path = self._output_path(name, '_no_watermark.mp4')
                self.merge_audio(name, output_path, TEMP_VIDEO)
        finally:
            # Clean up the scratch file once processing ends (or fails)
            self._remove_temp_video()

    def remove_video_subtitle(self):
        """Remove subtitles from every video in the input directory.

        The ROI is selected once, on the first readable video, and reused.
        """
        os.makedirs(self.output, exist_ok=True)
        roi = None

        try:
            for name in self._list_videos():
                video = cv2.VideoCapture(name)
                video_writer = self._open_writer(video)
                try:
                    # Process the video frame by frame
                    success, frame = video.read()
                    if roi is None and success:
                        roi = self.select_roi(frame, 'select subtitle ROI')

                    while success:
                        mask = self.generate_subtitle_mask(frame, roi)
                        video_writer.write(self.inpaint_image(frame, mask))
                        success, frame = video.read()
                finally:
                    video.release()
                    video_writer.release()

                # Re-attach the audio track
                output_path = self._output_path(name, '_no_sub.mp4')
                self.merge_audio(name, output_path, TEMP_VIDEO)
        finally:
            # Clean up the scratch file once processing ends (or fails)
            self._remove_temp_video()
 
 # Remove the watermark
video_path = 'video'
output_path = 'output'
remover = WatermarkRemover(
    video_path=video_path,
    output=output_path,
    threshold=80,
    kernel_size=5,
)
remover.remove_video_watermark()
# Remove subtitles
# remover = WatermarkRemover(video_path,output_path,threshold=80, kernel_size=5)
# remover.remove_video_subtitle()

效果一般吧:

相关推荐
精灵vector1 小时前
构建专家级SQL Agent交互
python·aigc·ai编程
q567315231 小时前
Java Selenium反爬虫技术方案
java·爬虫·selenium
Zonda要好好学习1 小时前
Python入门Day2
开发语言·python
Vertira1 小时前
pdf 合并 python实现(已解决)
前端·python·pdf
太凉2 小时前
Python之 sorted() 函数的基本语法
python
项目題供诗2 小时前
黑马python(二十四)
开发语言·python
晓13132 小时前
OpenCV篇——项目(二)OCR文档扫描
人工智能·python·opencv·pycharm·ocr
是小王同学啊~2 小时前
(LangChain)RAG系统链路向量检索器之Retrievers(五)
python·算法·langchain
AIGC包拥它3 小时前
提示技术系列——链式提示
人工智能·python·langchain·prompt
孟陬3 小时前
Python matplotlib 如何**同时**展示正文和 emoji
python