【小白量化机器人】爬取财经新闻并利用本地大模型评分选择合适交易策略

【小白量化机器人】环境，提供了全球市场的实时行情和交易接口。我们利用本地大模型进行分析。

一般趋势行情选择趋势策略，震荡行情选择震荡策略。

我们利用爬虫，爬取最近2天的财经新闻。

再利用大模型对每条新闻进行多空评分，并取平均分。如果新闻整体平稳并偏向看多，就使用趋势交易策略；如果评分中性、行情可能维持震荡，则使用震荡交易策略。

一、爬取新闻程序演示代码。

（以下为 Python 代码）

import requests
from bs4 import BeautifulSoup
import os
from datetime import datetime, timedelta
import re
import sys
import io

# Re-wrap the standard streams so Chinese text is always written as UTF-8,
# whatever the console's default encoding is.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Best effort: switch the process to a Chinese UTF-8 locale.
import locale
try:
    locale.setlocale(locale.LC_ALL, 'zh_CN.UTF-8')
except locale.Error:
    # The 'zh_CN.UTF-8' locale is not installed on this system. Every file in
    # this script is opened with an explicit encoding='utf-8', so keeping the
    # default locale is safe and better than crashing at start-up.
    pass

def create_directory():
    """Create (if needed) and return today's output directory.

    The directory is named after the current local date (``YYYY-MM-DD``) and
    is located directly under the current working directory.

    Returns:
        str: Path of the directory, built from ``os.getcwd()``.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    directory = os.path.join(os.getcwd(), today)
    # exist_ok=True removes the race between a separate exists() check and
    # the creation itself (e.g. two runs started at the same time).
    os.makedirs(directory, exist_ok=True)
    return directory

def clean_filename(filename, max_length=100):
    """Turn an arbitrary title into a string that is safe to use as a file name.

    Removes the characters that are illegal in file names on common
    platforms (``< > : " / \\ | ? *`` and ASCII control characters such as
    newlines and tabs), truncates the result to ``max_length`` characters and
    drops surrounding whitespace and trailing dots.

    Args:
        filename: Raw text, typically a news headline.
        max_length: Maximum number of characters to keep (default 100).

    Returns:
        str: The cleaned name, or ``'untitled'`` when nothing usable is left.
    """
    # '"' and control characters were previously let through; both are
    # rejected by Windows and make open() fail.
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)
    # Trailing spaces and dots are stripped because Windows silently drops
    # them, which would make the saved name differ from the one reported.
    cleaned = cleaned[:max_length].strip().rstrip(' .')
    return cleaned or 'untitled'

def _fetch_soup(url, headers, timeout=10):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    Raises whatever ``requests`` raises on network or HTTP errors.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # The charset is a parameter of the Content-Type header, not a header of
    # its own, so it has to be looked up there. When the server does not
    # declare one, fall back to the encoding detected from the body.
    declared = re.search(
        r'charset\s*=\s*["\']?([\w.:-]+)',
        response.headers.get('Content-Type', ''),
        re.IGNORECASE,
    )
    response.encoding = declared.group(1) if declared else response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def _extract_publish_time(soup, url):
    """Return the article's publish-time text, or None when none is found.

    Sources are tried in order: the ``publishdate`` meta tag, time/span tags
    with a time-like class, and finally a ``/YYYY-MM-DD/`` segment in the URL.
    """
    meta_time = soup.find('meta', {'name': 'publishdate'})
    if meta_time and meta_time.get('content'):
        return meta_time.get('content')

    for tag in soup.find_all(['time', 'span'], class_=['time', 'pubtime', 'publish_time']):
        time_text = tag.get_text(strip=True)
        if re.search(r'\d{4}-\d{2}-\d{2}', time_text):
            return time_text

    date_match = re.search(r'/(\d{4}-\d{2}-\d{2})/', url)
    return date_match.group(1) if date_match else None


def _extract_paragraphs(soup):
    """Return the article body as a list of non-empty paragraph strings."""
    paragraphs = []
    content_div = soup.find('div', class_=['article', 'content', 'main-content'])
    if content_div:
        texts = (p.get_text(strip=True) for p in content_div.find_all('p'))
        paragraphs = [text for text in texts if text]

    if not paragraphs:
        # No recognised container: take every <p> on the page, keeping only
        # longer ones to filter out short non-article text.
        texts = (p.get_text(strip=True) for p in soup.find_all('p'))
        paragraphs = [text for text in texts if len(text) > 50]
    return paragraphs


def _save_if_recent(index, news_url, headers, directory, earliest_date):
    """Fetch one article and save it when it is recent enough.

    Args:
        index: Position of the link, used for the fallback title.
        news_url: Address of the article.
        headers: HTTP headers to send.
        directory: Directory the text file is written to.
        earliest_date: Articles published before this date are skipped.

    Returns:
        bool: True when a file was written, False when the article was skipped.

    Network and file errors propagate to the caller.
    """
    news_soup = _fetch_soup(news_url, headers)

    title = news_soup.find('h1') or news_soup.find('h2')
    title_text = title.get_text(strip=True) if title else f"新闻{index}"

    publish_time = _extract_publish_time(news_soup, news_url)
    if not publish_time or not isinstance(publish_time, str):
        return False
    date_str = re.search(r'\d{4}-\d{2}-\d{2}', publish_time)
    if not date_str:
        return False
    try:
        publish_date = datetime.strptime(date_str.group(0), '%Y-%m-%d').date()
    except ValueError as e:
        # Only a malformed date is reported as a date-parsing failure; other
        # errors are left to the caller's per-article handler.
        print(f"解析日期失败: {publish_time}, 错误: {str(e)}")
        return False
    if publish_date < earliest_date:
        return False

    content = _extract_paragraphs(news_soup)
    if not (title_text and content):
        return False

    filename = clean_filename(title_text) + '.txt'
    file_path = os.path.join(directory, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(f"标题: {title_text}\n")
        f.write(f"链接: {news_url}\n")
        f.write(f"发布时间: {publish_time}\n")
        f.write(f"爬取时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("内容:\n")
        f.write('\n'.join(content))

    print(f"已保存: {filename} (发布时间: {publish_time})")
    return True


def crawl_financial_news():
    """Crawl the Sina Finance front page and save recent articles as text files.

    Collects every ``https://`` link whose URL contains ``/finance/`` or
    ``/money/``, downloads each one, and writes articles whose publish date is
    no older than two days before today into the directory returned by
    ``create_directory()``. One file is written per article, named after the
    cleaned headline.

    Returns:
        None. Progress and errors are printed; a failure on one article does
        not stop the others, and a failure on the front page ends the crawl.
    """
    url = "https://finance.sina.com.cn/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        soup = _fetch_soup(url, headers)
        directory = create_directory()

        # dict.fromkeys de-duplicates while keeping page order, so the
        # processing order and the fallback titles are reproducible.
        news_links = list(dict.fromkeys(
            link['href']
            for link in soup.find_all('a', href=True)
            if ('/finance/' in link['href'] or '/money/' in link['href'])
            and link['href'].startswith('https://')
        ))
        print(f"找到 {len(news_links)} 条新闻链接")

        two_days_ago = (datetime.now() - timedelta(days=2)).date()

        saved_count = 0
        for i, news_url in enumerate(news_links):
            try:
                if _save_if_recent(i, news_url, headers, directory, two_days_ago):
                    saved_count += 1
            except Exception as e:
                # Boundary handler: one broken article must not abort the run.
                print(f"爬取新闻失败: {news_url}, 错误: {str(e)}")

        print(f"爬取完成，共保存 {saved_count} 条最近2天内的新闻")

    except Exception as e:
        print(f"爬取失败: {str(e)}")

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    crawl_financial_news()

二、本地大模型分析代码。

（以下为 Python 代码）

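## Connect to a local large language model served by Ollama
## Install the Python client first: pip install ollama
# Start the model before running this script; the name must match the `model`
# passed to chat() below (this script uses 'qwen3:8b'), e.g.: ollama run qwen3:8b
# (the original note used: ollama run deepseek-coder-v2:latest)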
import ollama
from ollama import chat
import os
import re
from datetime import datetime, timedelta

# Prompt prefix that is placed in front of every article sent to the model.
# It asks for a single bullish/bearish score from 1 to 10 (6 and above is
# bullish, 5 and below is bearish) with no reasoning in the answer.
规则='''
帮我分析下面新闻，对明天股市有什么影响。对股市多空评分为1-10，6以上看多，5以下看空。只要一个评分，你打多少分？不要思考过程，只要评分。
'''

def extract_score_from_response(response):
    """Extract the first score between 1 and 10 from the model's reply.

    Args:
        response: Text returned by the model; may be empty or None.

    Returns:
        int | None: The first standalone integer from 1 to 10 found in the
        reply, or None when there is none.
    """
    if not response:
        return None

    # Some reasoning models put their scratch work in <think>...</think>;
    # numbers in there are not the answer, so drop that part first.
    visible = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

    # Digit look-arounds are used instead of \b: for str patterns \b treats
    # CJK characters as word characters, so "我打7分" or "评分8" would never
    # match. The look-behind also rejects the fractional part of "0.5".
    match = re.search(r'(?<![\d.])(10|[1-9])(?!\d)', visible)
    return int(match.group(1)) if match else None

def analyze_news_content(content, model='qwen3:8b'):
    """Ask the local model to score one news article.

    Args:
        content: Full text of the article.
        model: Name of the Ollama model to query. Defaults to ``'qwen3:8b'``,
            the model this script was written for.

    Returns:
        tuple: ``(score, reply)`` where ``score`` is the integer extracted by
        ``extract_score_from_response`` (or None when no score is found) and
        ``reply`` is the model's stripped text. ``(None, None)`` when the
        request fails; the error is printed.
    """
    # The prompt is the scoring rule followed by the article text.
    ask = 规则 + '  ' + content

    try:
        stream = chat(
            model=model,
            messages=[{'role': 'user', 'content': ask}],
            stream=True,
        )

        # Join the streamed pieces in one pass; `or ''` guards against a
        # chunk whose content is None.
        response = ''.join(chunk['message']['content'] or '' for chunk in stream)

        score = extract_score_from_response(response)
        return score, response.strip()

    except Exception as e:
        # Boundary handler: a failed request must not stop the batch.
        print(f"分析新闻时出错: {e}")
        return None, None

def process_news_directory(directory_path):
    """Score every ``.txt`` news file found in ``directory_path``.

    Args:
        directory_path: Directory that holds the saved news files.

    Returns:
        tuple: ``(scores, file_results)``. ``scores`` is the list of scores
        that could be extracted; ``file_results`` has one dict per file with
        the keys ``filename``, ``score`` and ``response``. ``(None, None)``
        when the directory is missing or contains no ``.txt`` file.
    """
    if not os.path.exists(directory_path):
        print(f"目录不存在: {directory_path}")
        return None, None

    txt_files = []
    for entry in os.listdir(directory_path):
        if entry.endswith('.txt'):
            txt_files.append(entry)
    print(f"找到 {len(txt_files)} 个新闻文件")

    if not txt_files:
        print("目录中没有找到txt文件")
        return None, None

    scores = []
    file_results = []
    for filename in txt_files:
        print(f"\n正在分析: {filename}")
        # One record per file; filled in below depending on the outcome.
        record = {'filename': filename, 'score': None, 'response': None}

        try:
            with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as handle:
                content = handle.read()

            score, response = analyze_news_content(content)
            record['response'] = response

            if score is None:
                print(f"无法提取评分，响应: {response}")
            else:
                record['score'] = score
                scores.append(score)
                print(f"评分: {score}")

        except Exception as e:
            # Keep going with the next file; the error text is kept in place
            # of the model's reply.
            print(f"处理文件 {filename} 时出错: {e}")
            record['score'] = None
            record['response'] = f"错误: {e}"

        file_results.append(record)

    return scores, file_results

def calculate_prediction(scores):
    """Average the scores and map the average to a market outlook.

    Args:
        scores: Sequence of numeric scores; may be empty or None.

    Returns:
        tuple: ``(average, label)``. The label is "看多" for an average of 6
        or more, "看空" for 5 or less and "中性" in between. When there are
        no scores the result is ``(None, "没有有效的评分数据")``.
    """
    if not scores:
        return None, "没有有效的评分数据"

    mean_value = sum(scores) / len(scores)

    # Guard clauses, one per band.
    if mean_value >= 6:
        return mean_value, "看多"
    if mean_value <= 5:
        return mean_value, "看空"
    return mean_value, "中性"

def save_results_to_file(file_results, avg_score, prediction, output_file="分析结果.txt"):
    """Write the average score to ``output_file``.

    Only the average score, formatted with two decimals, is written;
    ``file_results`` and ``prediction`` are accepted but not used.

    Args:
        file_results: Per-file results (unused).
        avg_score: Average score to store.
        prediction: Outlook label (unused).
        output_file: Destination path.

    Returns:
        bool: True when the file was written, False when an error occurred
        (the error is printed).
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as handle:
            # The file holds nothing but the number.
            handle.write(format(avg_score, '.2f'))

        print(f"\n分析结果已保存到: {output_file}")
        succeeded = True
    except Exception as err:
        print(f" 保存结果到文件时出错: {err}")
        succeeded = False
    return succeeded
#微：17578755056

def main():
    """Score today's news files, print a summary and store the average.

    Reads the directory named after today's date (``YYYY-MM-DD``, relative to
    the current working directory), prints the per-file scores and the
    overall outlook, and passes the result to ``save_results_to_file``.
    """
    print("开始分析新闻对股市的影响...")

    # The news directory is named after today's date.
    news_directory = datetime.now().strftime('%Y-%m-%d')
    scores, file_results = process_news_directory(news_directory)

    # Nothing usable: report it and stop.
    if not scores:
        print("\n未能获取有效的评分数据")
        return

    avg_score, prediction = calculate_prediction(scores)
    separator = "=" * 50

    print("\n" + separator)
    print("分析结果汇总:")
    print(separator)

    # Per-file details, with the reply shortened to its first 100 characters.
    for entry in file_results:
        print(f"\n文件: {entry['filename']}")
        if entry['score'] is None:
            print("评分: 无法获取")
        else:
            print(f"评分: {entry['score']}")
        if entry['response']:
            print(f"响应: {entry['response'][:100]}...")

    print("\n" + separator)
    print(f"平均评分: {avg_score:.2f}")
    print(f"多空预测: {prediction}")
    print(separator)

    # Explanation for each outlook; anything else is treated as neutral.
    explanations = {
        "看多": "\n预测说明: 多数新闻对股市有积极影响，建议关注投资机会",
        "看空": "\n预测说明: 多数新闻对股市有负面影响，建议谨慎操作",
    }
    print(explanations.get(prediction, "\n预测说明: 新闻影响中性，市场可能维持震荡格局"))

    save_results_to_file(file_results, avg_score, prediction)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

程序分析的最终结果，保存到文件【分析结果.txt】中，后面我们根据这个文件的评分，自动选择合适的交易策略。

超越自己是我的每一步！我的进步就是你的进步！