# bilibili - 技术栈 (tech stack)

# python 复制代码 (code listing copied from the web page)
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pywinauto.application import Application
import win32clipboard as w
from selenium.webdriver.common.by import By
import os
import json
import requests
import threading
import re
import string
import random
import time
from urllib.parse import unquote
from concurrent.futures import ThreadPoolExecutor
# from bs4 import BeautifulSoup
# from moviepy.editor import VideoFileClip, AudioFileClip
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
# Module-level HTTP header set: a desktop Chrome user agent, an empty cookie
# and an Origin value.
# NOTE(review): the Origin host is "bilibili11.com" while the pages opened
# elsewhere in this file are on "bilibili.com" - confirm this is intended.
_CHROME_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/118.0.0.0 Safari/537.36'
)
headers = {
    'User-Agent': _CHROME_USER_AGENT,
    'cookie': '',
    'Origin': 'https://www.bilibili11.com/',
}


def setText(aString):
    """Replace the Windows clipboard contents with ``aString``.

    Args:
        aString: Text passed to ``SetClipboardText``.

    The clipboard is closed even when emptying or writing raises, so a
    failed call does not leave it held open by this process.
    """
    w.OpenClipboard()
    try:
        w.EmptyClipboard()
        w.SetClipboardText(aString)
    finally:
        # Always pair OpenClipboard with CloseClipboard, including on error.
        w.CloseClipboard()


def run_Chrome():
    """Start Chrome and return a Selenium driver with bilibili's home page open.

    Two things happen here:

    1. A Chrome process is launched through ``cmd.exe`` with
       ``--remote-debugging-port=9999`` (the address ``get_user_browser``
       attaches to).
    2. A Selenium-driven Chrome is created through the chromedriver binary
       under ``D:\\chrome-win64\\chrome-win64`` and pointed at
       ``https://www.bilibili.com/``.

    Returns:
        The ``webdriver.Chrome`` instance. It is returned even when loading
        the home page fails, because that error is only printed.
    """
    # The handle returned by pywinauto is not needed afterwards, so it is
    # not kept.
    Application().start(
        r'c:\WINDOWS\System32\cmd.exe /c cd D:\\chrome-win64\\chrome-win64 && start chrome.exe --remote-debugging-port=9999',
        create_new_console=True, wait_for_idle=False)
    time.sleep(3)  # give Chrome time to start

    # Options for the Selenium-driven browser.
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-extensions')
    options.add_argument('--no-sandbox')  # run Chrome use root
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    from selenium.webdriver.chrome.service import Service

    service = Service(r"D:\chrome-win64\chrome-win64\chromedriver.exe")
    # The options were previously built but never handed to the driver, so
    # none of the switches above took effect.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get("https://www.bilibili.com/")
        time.sleep(3)  # wait for the page to settle
    except Exception as e:
        # Best effort: the caller still receives the driver.
        print("Error occurred while getting top window:", e)

    return driver


def get_user_browser(target_url=None, wait_seconds=15):
    """Attach to the Chrome listening on 127.0.0.1:9999 and open a page.

    The driver first loads the bilibili home page, waits, then loads the
    target page and waits again.

    Args:
        target_url: Page to open after the home page. When omitted, the
            module-level ``url`` is used, which is what this function
            always did before the parameter existed.
        wait_seconds: Pause after each page load. Defaults to 15.

    Returns:
        The attached ``webdriver.Chrome`` instance.

    Raises:
        NameError: If ``target_url`` is omitted and no module-level ``url``
            exists (it is only assigned under the ``__main__`` guard).
    """
    if target_url is None:
        # Resolve the fallback up front so a missing global fails before any
        # browser work is done instead of after two page loads.
        target_url = url

    options = Options()
    options.add_experimental_option("debuggerAddress", "127.0.0.1:9999")
    driver = webdriver.Chrome(options=options)
    driver.get('https://www.bilibili.com/')
    time.sleep(wait_seconds)
    driver.get(target_url)
    time.sleep(wait_seconds)
    return driver


def extract_audio(audio_file_path, mp3_path):
    """Write the audio track of a media file to ``mp3_path``.

    Args:
        audio_file_path: Path opened with ``VideoFileClip``.
        mp3_path: Destination passed to ``write_audiofile``.

    Errors are not caught here; they propagate to the caller after the
    opened clip objects have been closed.
    """
    clip = track = None
    try:
        clip = VideoFileClip(audio_file_path)
        track = clip.audio
        track.write_audiofile(mp3_path)
    finally:
        # Close whatever was opened: the clip first, then its audio track.
        for handle in (clip, track):
            if handle is not None:
                handle.close()


def generate_random_code(length=5):
    """Return a random string of ASCII letters and digits.

    Args:
        length: Number of characters to draw (with replacement). Defaults
            to 5.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)


# Default locations of the URL list that is read and the JSONL file that is
# appended to.
_URL_LIST_PATH = r"D:\2025\0908\input\915\pipeline\aidcpipeline\novel_data\novel-dialogue-extraction\miwu.txt"
_RESULT_JSONL_PATH = r"D:\2025\0908\input\915\pipeline\aidcpipeline\novel_data\novel-dialogue-extraction\miwu.jsonl"


def _read_video_info(driver, video_url):
    """Read uploader, publish time and title from the page loaded in ``driver``.

    Returns:
        A dict with the keys ``url``, ``name``, ``time_d`` and ``title``.
        ``time_d`` and ``title`` stay ``""`` when their lookups fail.

    Raises:
        Whatever the uploader lookup raises; there is no fallback for it, so
        the caller skips the page in that case.
    """
    time_d = ""
    title = ""

    name = driver.find_element(By.CSS_SELECTOR, "[class='up-detail-top']").text
    print(name)

    try:
        # The second matching element is used; fewer than two matches raise
        # IndexError, which is reported below.
        time_d = driver.find_elements(By.CSS_SELECTOR, "[class='pubdate-ip-text']")[1].text
        print(time_d)
    except Exception as e:
        print("ddddd", e)

    try:
        title = driver.find_element(By.CSS_SELECTOR, "[class='video-info-title']").text
        print(title)
    except Exception:
        # Second attempt: read the title from the h1's data-title attribute.
        try:
            title = driver.find_element(
                By.CSS_SELECTOR, "h1[class='video-title special-text-indent']"
            ).get_attribute('data-title')
            print(title)
        except Exception as e:
            print("fffff", e)

    return {
        'url': video_url,
        'name': name,
        'time_d': time_d,
        'title': title,
    }


def run(url, input_path=_URL_LIST_PATH, output_path=_RESULT_JSONL_PATH):
    """Visit every video URL listed in a text file and log its metadata.

    A browser is started with ``run_Chrome`` and ``url`` is opened first.
    Each non-blank line of ``input_path`` is then loaded as a page, and one
    JSON object per page is appended to ``output_path``. Pages whose
    uploader block cannot be read are reported and skipped.

    Args:
        url: Page opened right after the browser starts.
        input_path: UTF-8 text file with one video URL per line. The file
            must already exist; this function does not create it.
        output_path: JSONL file that results are appended to.

    Raises:
        OSError: If ``input_path`` cannot be opened.
        Exception: Errors from loading a page propagate; the browser is shut
            down first.
    """
    driver = run_Chrome()
    try:
        driver.get(url)

        with open(input_path, 'r', encoding='utf-8') as url_file:
            for line in url_file:
                # A separate name keeps the ``url`` argument intact.
                video_url = line.strip()
                if not video_url:
                    continue

                print(video_url)
                driver.get(video_url)
                time.sleep(3.5)  # let the page render before querying it

                try:
                    data = _read_video_info(driver, video_url)
                except Exception as e:
                    print(e)
                    continue

                try:
                    # Append per record so finished pages survive a later crash.
                    with open(output_path, 'a', encoding='utf-8') as jsonl_file:
                        jsonl_file.write(json.dumps(data, ensure_ascii=False) + '\n')
                except Exception as e:
                    # Still best effort, but Ctrl+C is no longer swallowed.
                    print("pass+++++", e)
    finally:
        # quit() ends the whole driver session, and runs on every exit path.
        driver.quit()



if __name__ == "__main__":
    # bilibili search-results URL; the keyword is a percent-encoded search
    # phrase. The name ``url`` is kept at module level because
    # ``get_user_browser`` falls back to it.
    url = (
        "https://search.bilibili.com/all"
        "?keyword=%E5%9C%B0%E9%93%81%E9%80%83%E7%94%9F%E8%BF%B7%E9%9B%BE%E8%8D%92%E5%B2%9B"
        "&from_source=webtop_search"
        "&spm_id_from=333.1007"
        "&search_source=5"
        "&duration=2"
    )
    run(url)
# Chrome for Testing downloads (chrome / chromedriver builds): https://googlechromelabs.github.io/chrome-for-testing/#stable