获取dm音视频文案

获取dm音视频文案

python 复制代码
import re
import json
from urllib.parse import urlparse, parse_qs
import requests
import dashscope
from dashscope.audio.asr import Transcription


"""
Each file needs to export a function named `handler`. This function is the entrance to the Tool.

Parameters:
args: parameters of the entry function.
args.input - input parameters, you can get test input value by args.input.xxx.
args.logger - logger instance used to print logs, injected by runtime.

Remember to fill in input/output in Metadata, it helps LLM to recognize and use tool.

Return:
The return data of the function, which should match the declared output parameters.
"""

def get_video_id(text_or_url):
    try:

        # 1.提取视频地址
        #    格式1:4.66 B@t.EH 08/24 VYM:/ 有些私活真裁员降薪  https://v.douyin.com/i5WJFJWv/ 复制
        #    格式2:https://www.douyin.com/user/self?from_tab_name=main&modal_id=7452662497322077490
        #    格式3:https://www.douyin.com/video/7452662497322077490?modeFrom=userPost&secUid=MS4wLjABAAAACdnpzvOcEtvmZ8h9
        match_list = re.findall(r'https://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                                text_or_url)
        if not match_list:
            return False, "短视频地址格式错误"
        video_url = match_list[0]

        # 2.处理 短连接
        if video_url.startswith("https://v.douyin.com"):
            res = requests.get(video_url, allow_redirects=False)
            video_url = res.headers["Location"]

        parsed_url = urlparse(video_url)
        query_params = parse_qs(parsed_url.query)

        # 3.地址GET参数处的modal_id读取
        modal_id_list = query_params.get("modal_id")
        if modal_id_list:
            video_id = modal_id_list[0]
        else:
            path_list = parsed_url.path.strip("/").split("/")
            video_id = path_list[-1]

        return True, video_id
    except Exception as e:
        return False, "短视频ID提取失败"


def get_video_info(video_id):
    try:
        # 拼接生成m端地址
        res = requests.get(
            url=f"https://m.douyin.com/share/video/{video_id}",
            headers={
                "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
            }
        )

        # 正则提取 视频字典信息
        match_list = re.findall("window._ROUTER_DATA = (.*)</script>", res.text)
        data_dict = json.loads(match_list[0])
        video_info = data_dict['loaderData']["video_(id)/page"]["videoInfoRes"]['item_list'][0]
        # 视频信息
        nickname = video_info["author"]['nickname']
        digg_count = video_info["statistics"]['digg_count']
        comment_count = video_info["statistics"]['comment_count']
        desc = video_info["desc"]
        video_url = video_info["video"]['play_addr']["url_list"][0]

        return True, (nickname, desc, video_url, digg_count, comment_count)
    except Exception as e:
        return False, "视频信息提取失败"


def get_text_by_ali(api_key, video_url):
    try:
        # 1.阿里云百炼账号,创建并获取api_key
        dashscope.api_key = api_key#注册后输入

        # 2.调用模型提取文案
        # 文档:https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-recorded-speech-recognition-python-api?spm=a2c4g.11186623.help-menu-2400256.d_3_3_7_3_3_1.7ebf7f01D3wF0M
        #       https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-recorded-speech-recognition-python-api?spm=0.0.0.i1
        response = Transcription.call(
            model='paraformer-v1',
            file_urls=[video_url],
            language_hints=['zh', 'en']  # "language_hints"只支持paraformer-v2和paraformer-realtime-v2模型
        )

        # 3.读取结果  {"transcription_url":"提取文案结果的URL地址",...}
        status_code = response.get('status_code')
        if status_code != 200:
            return False, response.get('message', "通义千问提取失败")
        res_dict = response["output"]["results"][0]

        transcription_url = res_dict["transcription_url"]

        # 4.获取文案
        res = requests.get(transcription_url)
        res_dict = res.json()
        text = res_dict["transcripts"][0]["text"]

        return True, text
    except Exception as e:
        return False, "文案提取异常" + str(e)


def main():
    douyin_url = input.url
    ali_bailian_api_key = input.ali_bailian_api_key

    if not douyin_url.startswith("http"):
        douyin_url = re.search("https?://[^\\s]+", douyin_url).group()
    # 1.获取短视频ID
    status, video_id = get_video_id(douyin_url)
    # args.logger.info(video_id)
    if not status:
        return {"status": False, "error": video_id}

    # 2.获取视频信息(标题、作者、视频地址)
    status, video_info = get_video_info(video_id)
    if not status:
        return {"status": False, "error": str(video_info)}
    nickname, desc, video_url, digg_count, comment_count = video_info

    # 3.视频地址转换
    #   https://aweme.snssdk.com/aweme/v1/playwm/?video_id=v0300f
    #   https://aweme.snssdk.com/aweme/v1/play/?video_id=v0300f
    video_url = video_url.replace("playwm", "play")

    # 4.调用 阿里云模型,提取视频文案
    # args.logger.info(video_url)
    status, text = get_text_by_ali(ali_bailian_api_key, video_url)
    if not status:
        return {"status": False, "error": text}

    return {"status": True, "data": {"nickname": nickname, "desc": desc, "text": text, "digg_count": digg_count,
                                  "comment_count": comment_count}}
相关推荐
Flittly14 小时前
【从零手写 ClaudeCode:learn-claude-code 项目实战笔记】(3)TodoWrite (待办写入)
python·agent
千寻girling18 小时前
一份不可多得的 《 Django 》 零基础入门教程
后端·python·面试
databook1 天前
探索视觉的边界:用 Manim 重现有趣的知觉错觉
python·动效
明月_清风1 天前
Python 性能微观世界:列表推导式 vs for 循环
后端·python
明月_清风1 天前
Python 性能翻身仗:从 O(n) 到 O(1) 的工程实践
后端·python
helloweilei2 天前
python 抽象基类
python
用户8356290780512 天前
Python 实现 PPT 转 HTML
后端·python
zone77392 天前
004:RAG 入门-LangChain读取PDF
后端·python·面试
zone77392 天前
005:RAG 入门-LangChain读取表格数据
后端·python·agent
树獭非懒2 天前
AI大模型小白手册|Embedding 与向量数据库
后端·python·llm