# Get the transcript text of a "dm" audio/video clip.

# Language: python
import re
import json
from urllib.parse import urlparse, parse_qs
import requests
import dashscope
from dashscope.audio.asr import Transcription


"""
Each file needs to export a function named `handler`. This function is the entrance to the Tool.

Parameters:
args: parameters of the entry function.
args.input - input parameters, you can get test input value by args.input.xxx.
args.logger - logger instance used to print logs, injected by runtime.

Remember to fill in input/output in Metadata, it helps LLM to recognize and use tool.

Return:
The return data of the function, which should match the declared output parameters.
"""

def get_video_id(text_or_url):
    """Extract a video ID from share text or from a video URL.

    The first ``https://`` URL found in *text_or_url* is used. A short link
    on ``v.douyin.com`` is resolved by reading the ``Location`` header of its
    redirect response (the redirect itself is not followed). The ID is taken
    from the ``modal_id`` query parameter when present, otherwise from the
    last segment of the URL path.

    Args:
        text_or_url: Share text that contains a URL, or a URL on its own.

    Returns:
        ``(True, video_id)`` on success, ``(False, error_message)`` otherwise.
        Exceptions are not propagated; they are reported as a failure tuple.
    """
    try:
        # 1. Pull the first https URL out of the input. Accepted shapes:
        #    a) share text with a short link: "... https://v.douyin.com/i5WJFJWv/ ..."
        #    b) https://www.douyin.com/user/self?from_tab_name=main&modal_id=7452662497322077490
        #    c) https://www.douyin.com/video/7452662497322077490?modeFrom=userPost&secUid=...
        match_list = re.findall(
            r'https://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            text_or_url)
        if not match_list:
            return False, "短视频地址格式错误"
        video_url = match_list[0]

        # 2. Resolve a short link to its redirect target.
        if video_url.startswith("https://v.douyin.com"):
            # A timeout keeps an unresponsive server from blocking the caller forever.
            res = requests.get(video_url, allow_redirects=False, timeout=10)
            video_url = res.headers.get("Location")
            if not video_url:
                # No redirect target was returned, so there is nothing to parse.
                return False, "短视频ID提取失败"

        parsed_url = urlparse(video_url)

        # 3. Prefer the modal_id query parameter; fall back to the last path segment.
        modal_id_list = parse_qs(parsed_url.query).get("modal_id")
        if modal_id_list:
            video_id = modal_id_list[0]
        else:
            video_id = parsed_url.path.strip("/").split("/")[-1]

        # An empty path (e.g. "https://www.douyin.com/") yields "", which is not an ID.
        if not video_id:
            return False, "短视频ID提取失败"

        return True, video_id
    except Exception:
        # Boundary handler: callers rely on the (status, payload) tuple, not on exceptions.
        return False, "短视频ID提取失败"


def get_video_info(video_id):
    """Fetch the mobile share page of a video and extract its metadata.

    The page ``https://m.douyin.com/share/video/{video_id}`` is requested with
    an iPhone Safari user agent, the JSON assigned to ``window._ROUTER_DATA``
    is parsed, and the first entry of
    ``loaderData["video_(id)/page"]["videoInfoRes"]["item_list"]`` is read.

    Args:
        video_id: Video ID interpolated into the share-page URL.

    Returns:
        ``(True, (nickname, desc, video_url, digg_count, comment_count))`` on
        success, ``(False, "视频信息提取失败")`` on any failure. Exceptions
        are not propagated.
    """
    try:
        # Build the mobile share-page URL and fetch it.
        res = requests.get(
            url=f"https://m.douyin.com/share/video/{video_id}",
            headers={
                "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
            },
            # Without a timeout the call can block indefinitely on a stalled connection.
            timeout=10,
        )

        # Extract the embedded JSON. The match is non-greedy so it stops at the
        # first closing </script>, even when further script tags follow on the
        # same line; DOTALL lets the payload span line breaks.
        match_list = re.findall(
            r"window\._ROUTER_DATA = (.*?)</script>", res.text, re.DOTALL)
        data_dict = json.loads(match_list[0])
        video_info = data_dict['loaderData']["video_(id)/page"]["videoInfoRes"]['item_list'][0]

        # Fields returned to the caller.
        statistics = video_info["statistics"]
        nickname = video_info["author"]['nickname']
        digg_count = statistics['digg_count']
        comment_count = statistics['comment_count']
        desc = video_info["desc"]
        video_url = video_info["video"]['play_addr']["url_list"][0]

        return True, (nickname, desc, video_url, digg_count, comment_count)
    except Exception:
        # Boundary handler: network errors, a missing script tag, malformed JSON
        # and absent keys are all reported as the same failure tuple.
        return False, "视频信息提取失败"


def get_text_by_ali(api_key, video_url):
    """Transcribe the audio of a video through the DashScope Transcription API.

    Sets ``dashscope.api_key`` (a module-level setting), submits *video_url*
    to ``Transcription.call``, then downloads the JSON document referenced by
    the first result's ``transcription_url`` and returns the text of its
    first transcript.

    Args:
        api_key: DashScope API key (created in the Alibaba Cloud Bailian console).
        video_url: URL of the media file to transcribe.

    Returns:
        ``(True, text)`` on success. ``(False, message)`` when the API answers
        with a status code other than 200, or ``(False, "文案提取异常" + details)``
        when any step raises. Exceptions are not propagated.
    """
    try:
        # 1. Configure the SDK with the caller's API key.
        dashscope.api_key = api_key

        # 2. Ask the model to transcribe the file.
        # Docs: https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-recorded-speech-recognition-python-api
        # NOTE(review): the original comment states that "language_hints" is only
        # supported by paraformer-v2 / paraformer-realtime-v2, yet the model used
        # here is paraformer-v1 — confirm whether the hint has any effect.
        response = Transcription.call(
            model='paraformer-v1',
            file_urls=[video_url],
            language_hints=['zh', 'en'],
        )

        # 3. Read the result, which carries {"transcription_url": "<result URL>", ...}.
        if response.get('status_code') != 200:
            return False, response.get('message', "通义千问提取失败")
        first_result = response["output"]["results"][0]
        transcription_url = first_result["transcription_url"]

        # 4. Download the transcript document and pull out the text.
        # The timeout avoids hanging on a stalled download; raise_for_status
        # turns an HTTP error into an explicit failure instead of a JSON/key error.
        res = requests.get(transcription_url, timeout=30)
        res.raise_for_status()
        text = res.json()["transcripts"][0]["text"]

        return True, text
    except Exception as e:
        # Boundary handler: report the failure reason in the returned message.
        return False, "文案提取异常" + str(e)


def main(args=None):
    """Run the pipeline: share text/URL -> video ID -> video info -> transcript.

    Args:
        args: Optional object whose ``input`` attribute carries ``url`` and
            ``ali_bailian_api_key``. When omitted, the values are read from
            the global name ``input``, exactly as before.
            NOTE(review): unless the runtime injects an object named ``input``,
            that name resolves to the Python builtin, which has no ``url``
            attribute — confirm how this entry point is invoked.

    Returns:
        ``{"status": True, "data": {...}}`` with ``nickname``, ``desc``,
        ``text``, ``digg_count`` and ``comment_count`` on success, or
        ``{"status": False, "error": message}`` when any step fails.
    """
    params = args.input if args is not None else input
    douyin_url = params.url
    ali_bailian_api_key = params.ali_bailian_api_key

    # Reject empty input up front instead of failing on a string method.
    if not douyin_url:
        return {"status": False, "error": "短视频地址格式错误"}

    # Share text usually has a title before the link; keep only the URL part.
    if not douyin_url.startswith("http"):
        url_match = re.search("https?://[^\\s]+", douyin_url)
        if url_match is None:
            # Previously this path raised AttributeError on None.group().
            return {"status": False, "error": "短视频地址格式错误"}
        douyin_url = url_match.group()

    # 1. Resolve the video ID.
    status, video_id = get_video_id(douyin_url)
    if not status:
        return {"status": False, "error": video_id}

    # 2. Fetch the video metadata (author, description, play URL, counters).
    status, video_info = get_video_info(video_id)
    if not status:
        return {"status": False, "error": str(video_info)}
    nickname, desc, video_url, digg_count, comment_count = video_info

    # 3. Rewrite the play URL from the "playwm" endpoint to the "play" endpoint:
    #    https://aweme.snssdk.com/aweme/v1/playwm/?video_id=v0300f
    #    -> https://aweme.snssdk.com/aweme/v1/play/?video_id=v0300f
    video_url = video_url.replace("playwm", "play")

    # 4. Transcribe the video through the Alibaba Cloud model.
    status, text = get_text_by_ali(ali_bailian_api_key, video_url)
    if not status:
        return {"status": False, "error": text}

    return {
        "status": True,
        "data": {
            "nickname": nickname,
            "desc": desc,
            "text": text,
            "digg_count": digg_count,
            "comment_count": comment_count,
        },
    }