# Language: Python
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pywinauto.application import Application
import win32clipboard as w
from selenium.webdriver.common.by import By
import os
import json
import requests
import threading
import re
import string
import random
import time
from urllib.parse import unquote
from concurrent.futures import ThreadPoolExecutor
# from bs4 import BeautifulSoup
# from moviepy.editor import VideoFileClip, AudioFileClip
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
# Default HTTP request headers (browser-like User-Agent, empty cookie, fixed Origin).
# NOTE(review): the Origin host is "bilibili11.com" while every page opened in this
# file is on bilibili.com -- confirm this is intentional.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "cookie": "",
    "Origin": "https://www.bilibili11.com/",
}
def setText(aString):  # write to the clipboard
    """Replace the clipboard contents with ``aString``.

    Args:
        aString: Text handed to ``win32clipboard.SetClipboardText``.

    Raises:
        Whatever ``win32clipboard`` raises when the clipboard cannot be
        opened, emptied or written.
    """
    w.OpenClipboard()
    try:
        w.EmptyClipboard()
        w.SetClipboardText(aString)
    finally:
        # Release the clipboard even when emptying/writing fails; otherwise
        # it stays open after the error.
        w.CloseClipboard()
def run_Chrome():
    """Start Chrome and return a Selenium driver opened on the Bilibili home page.

    Two browsers are started here:

    1. Through pywinauto, a ``cmd.exe`` that launches ``chrome.exe`` with
       ``--remote-debugging-port=9999``. The returned driver is NOT attached
       to this instance (no ``debuggerAddress`` is configured for it).
    2. A chromedriver-managed Chrome session configured with the options
       below; this is the driver that is returned.

    Returns:
        The ``webdriver.Chrome`` instance. It is returned even when loading
        the home page fails (the error is only printed).
    """
    # Local import kept from the original so the block stays self-contained.
    from selenium.webdriver.chrome.service import Service

    Application().start(
        r'c:\WINDOWS\System32\cmd.exe /c cd D:\\chrome-win64\\chrome-win64 && start chrome.exe --remote-debugging-port=9999',
        create_new_console=True, wait_for_idle=False)
    time.sleep(3)  # give Chrome time to start

    # Options for the Selenium-driven browser.
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-extensions')
    options.add_argument('--no-sandbox')  # run Chrome use root
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    service = Service(r"D:\chrome-win64\chrome-win64\chromedriver.exe")
    # Fix: the options above were built but never handed to the driver, so
    # none of the flags took effect.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get("https://www.bilibili.com/")
        time.sleep(3)  # wait for the page/window to be ready
    except Exception as e:
        # Best effort: a failed initial navigation must not prevent the
        # caller from getting the driver.
        print("Error occurred while getting top window:", e)
    return driver
def get_user_browser():
    """Attach to the Chrome listening on 127.0.0.1:9999 and open two pages.

    The Bilibili home page is loaded first, then the module-level ``url``;
    each navigation is followed by a fixed 15-second pause.

    Returns:
        The attached ``webdriver.Chrome`` instance.
    """
    attach_opts = Options()
    attach_opts.add_experimental_option("debuggerAddress", "127.0.0.1:9999")
    browser = webdriver.Chrome(options=attach_opts)

    def _open_and_wait(address):
        browser.get(address)
        time.sleep(15)

    _open_and_wait('https://www.bilibili.com/')
    # NOTE: ``url`` is not a parameter; it is read from module globals. In the
    # visible file it is only assigned under the ``__main__`` guard.
    _open_and_wait(url)
    return browser
def extract_audio(audio_file_path, mp3_path):
    """Write the audio track of the media file to ``mp3_path``.

    Args:
        audio_file_path: Path opened with ``VideoFileClip``.
        mp3_path: Destination passed to ``write_audiofile``.

    Errors are not caught here; the clip (and its audio track, if one was
    obtained) is closed before any exception propagates.
    """
    clip = VideoFileClip(audio_file_path)
    track = None
    try:
        track = clip.audio
        track.write_audiofile(mp3_path)
    finally:
        # Same release order as before: the clip first, then its audio track.
        clip.close()
        if track is not None:
            track.close()
# Example
def generate_random_code(length=5):
    """Return a random string of ``length`` ASCII letters and digits.

    Characters are drawn with replacement via ``random.choices``.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
# Default input (one video URL per line) and output (one JSON object per line).
_VIDEO_URL_LIST = r"D:\2025\0908\input\915\pipeline\aidcpipeline\novel_data\novel-dialogue-extraction\miwu.txt"
_VIDEO_INFO_JSONL = r"D:\2025\0908\input\915\pipeline\aidcpipeline\novel_data\novel-dialogue-extraction\miwu.jsonl"


def _scrape_video_info(driver, video_url):
    """Read the fields of the video page currently loaded in ``driver``.

    Raises whatever Selenium raises when the ``up-detail-top`` element is
    missing; the other two fields fall back to ``""`` when they cannot be read.
    """
    # Text of the ``up-detail-top`` element (presumably the uploader panel).
    name = driver.find_element(By.CSS_SELECTOR, "[class='up-detail-top']").text
    print(name)

    time_d = ""
    try:
        # The second ``pubdate-ip-text`` element is the one that is used.
        time_d = driver.find_elements(By.CSS_SELECTOR, "[class='pubdate-ip-text']")[1].text
        print(time_d)
    except Exception as e:
        print("ddddd", e)

    title = ""
    try:
        title = driver.find_element(By.CSS_SELECTOR, "[class='video-info-title']").text
        print(title)
    except Exception:
        # Fallback selector: the title is stored in the ``data-title`` attribute.
        try:
            title = driver.find_element(
                By.CSS_SELECTOR, "h1[class='video-title special-text-indent']"
            ).get_attribute('data-title')
            print(title)
        except Exception as e:
            print("fffff", e)

    return {
        'url': video_url,
        'name': name,
        'time_d': time_d,
        'title': title,
    }


def _append_jsonl(path, record):
    """Append ``record`` to ``path`` as one JSON line (UTF-8, non-ASCII kept)."""
    with open(path, 'a', encoding='utf-8') as jsonl_file:
        jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')


def run(url, url_list_path=_VIDEO_URL_LIST, output_path=_VIDEO_INFO_JSONL):
    """Visit every video URL listed in a text file and record its metadata.

    Args:
        url: Page opened first, right after the browser starts.
        url_list_path: Text file with one video URL per line; blank lines
            are skipped.
        output_path: JSONL file that receives one record per scraped video
            with the keys ``url``, ``name``, ``time_d`` and ``title``.

    A video whose ``up-detail-top`` element cannot be read is skipped (the
    error is printed). Navigation errors and a missing URL list propagate to
    the caller, but the browser is always shut down first.

    NOTE: the search-result crawl that used to append video links to the URL
    list file is disabled; the list is only read here.
    """
    driver = run_Chrome()
    try:
        driver.get(url)
        with open(url_list_path, 'r', encoding='utf-8') as url_file:
            for line in url_file:
                video_url = line.strip()  # drop the newline and surrounding spaces
                if not video_url:
                    continue
                print(video_url)
                driver.get(video_url)
                time.sleep(3.5)  # fixed pause so the page can render

                try:
                    record = _scrape_video_info(driver, video_url)
                except Exception as e:
                    print(e)
                    continue

                try:
                    _append_jsonl(output_path, record)
                except Exception as e:
                    # Was a bare ``except:``; keep going, but report the cause
                    # and stop swallowing KeyboardInterrupt/SystemExit.
                    print("pass+++++", e)
    finally:
        # quit() ends the whole WebDriver session; close() only closed the
        # current window and was skipped entirely when an error occurred.
        driver.quit()
if __name__ == "__main__":
    # Script entry point. ``url`` is deliberately a module-level name:
    # get_user_browser() reads it as a global.
    # The value is a Bilibili search-results URL (percent-encoded keyword,
    # ``duration=2`` filter).
    url = "https://search.bilibili.com/all?keyword=%E5%9C%B0%E9%93%81%E9%80%83%E7%94%9F%E8%BF%B7%E9%9B%BE%E8%8D%92%E5%B2%9B&from_source=webtop_search&spm_id_from=333.1007&search_source=5&duration=2"
    run(url)
# Chrome for Testing downloads (Chrome and chromedriver builds): https://googlechromelabs.github.io/chrome-for-testing/#stable