AI 飞卢小说自动化处理工作流：获取提问素材

说明：从小说网站爬取内容，整理成向 AI 提问的素材，用于训练思维。

step1:

dot 复制代码

// Graphviz DOT description of the three-stage novel processing pipeline:
// fetch the novel ID list, save every chapter to disk, merge the chapters.
digraph NovelProcessingFlow {
    // Global styling applied to every node and edge
    node [shape=rectangle, style=filled, fillcolor="#e6f7ff", fontname="SimSun"];
    edge [arrowhead=vee, color="#1890ff"];

    // Node definitions (one node per pipeline stage)
    获取ID列表 [label="1. 从小说网站获取小说ID列表"];
    保存章节 [label="2. 对每个小说ID，保存每章内容为本地TXT文件（如：chapter_1.txt）"];
    合并文件 [label="3. 合并所有TXT文件，生成完整小说TXT（如：full_novel.txt）"];

    // Execution order of the stages
    获取ID列表 -> 保存章节;
    保存章节 -> 合并文件;
}

step2:

python 复制代码

import requests
from bs4 import BeautifulSoup
import re
import sys


def get_book_ids_from_single_page(url):
    """
    Collect the book IDs linked from a single Faloo library page.

    Only the given page is fetched; no pagination is followed. Links are
    looked up inside the ``<ul class="novelList">`` element when it exists,
    otherwise across the whole page. IDs are de-duplicated, sorted
    numerically, printed, and returned as a list of digit strings. Request
    and parsing errors are reported on stdout; whatever was collected up to
    that point (possibly an empty list) is still returned.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    }
    # Book links look like //wap.faloo.com/1521856.html; the digits are the ID.
    id_pattern = re.compile(r'//wap\.faloo\.com/(\d+)\.html')
    book_ids = []
    seen_ids = set()

    print(f"开始从页面抓取: {url}")

    try:
        # Download the page and force UTF-8 decoding.
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        page.encoding = 'utf-8'

        document = BeautifulSoup(page.text, 'html.parser')

        # Prefer the dedicated list container; otherwise scan every anchor.
        list_container = document.find('ul', class_='novelList')
        if list_container:
            anchors = list_container.find_all('a', href=True)
        else:
            print("未找到小说列表区域，尝试在整个页面查找链接模式。")
            anchors = document.find_all('a', href=True)

        # Keep the first occurrence of every ID.
        for anchor in anchors:
            found = id_pattern.search(anchor.get('href', ''))
            if not found:
                continue
            candidate = found.group(1)
            if candidate in seen_ids:
                continue
            seen_ids.add(candidate)
            book_ids.append(candidate)

        # Numeric (not lexicographic) ascending order.
        book_ids.sort(key=int)

        if not book_ids:
            print("未找到任何书籍ID。")
        else:
            print(f"\n当前页面找到的书籍ID ({len(book_ids)}个，已按数值排序):")
            for position, book_id in enumerate(book_ids, 1):
                print(f"  {position:2d}. {book_id}")

    except requests.exceptions.RequestException as e:
        print(f"请求页面时出错: {e}")
    except Exception as e:
        print(f"解析页面时发生错误: {e}")

    return book_ids


# Script entry point: scrape the book IDs from one hard-coded listing page.
if __name__ == "__main__":
    # Listing page URL supplied by the user
    page_url = "https://wap.faloo.com/y_0_0_0_0_9_0_5.html"

    print("飞卢小说网当前页面书籍ID抓取脚本")
    print("=" * 50)

    # Run the scrape; the function prints the IDs it finds and returns them
    book_ids = get_book_ids_from_single_page(page_url)

step3:

python 复制代码

import os
import time
import random
import requests
from bs4 import BeautifulSoup
from typing import Optional
from requests.exceptions import RequestException


def generate_filename(index: int, start_chapter: int = 101) -> str:
    """
    Build the file name under which a chapter is saved.

    Args:
        index: Zero-based offset of the current page from the first page.
        start_chapter: Chapter number assigned to offset 0 (default 101).

    Returns:
        A name such as ``chapter_101.txt``; the chapter number is
        zero-padded to at least three digits.
    """
    return "chapter_{:03d}.txt".format(start_chapter + index)


def fetch_with_retry(
        url: str,
        headers: dict,
        max_retries: int = 3,
        base_delay: float = 1.0,
        timeout: int = 10
) -> Optional[requests.Response]:
    """
    GET a URL, retrying on request errors with exponential backoff.

    Args:
        url: Address to request.
        headers: HTTP headers sent with every attempt.
        max_retries: Total number of attempts that will be made.
        base_delay: Base wait in seconds; doubled after every failed attempt.
        timeout: Per-request timeout in seconds.

    Returns:
        The successful Response, or None when every attempt failed (or when
        ``max_retries`` is not positive, in which case nothing is requested).
    """
    attempt = 0
    while attempt < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except RequestException as e:
            out_of_attempts = attempt >= max_retries - 1
            if out_of_attempts:
                print(f"请求失败，已达最大重试次数({max_retries}次): {str(e)}")
                return None
            # Exponential backoff plus a little random jitter.
            delay = base_delay * (2 ** attempt) + random.uniform(0, 0.5)
            print(f"第{attempt + 1}次重试失败，{delay:.2f}秒后重试: {str(e)}")
            time.sleep(delay)
        else:
            return response
        attempt += 1
    return None


def _decode_html(raw: bytes) -> str:
    """Decode a raw HTML payload with the first candidate encoding that fits.

    Strict UTF-8 is tried first because GBK-encoded Chinese text almost never
    validates as UTF-8, whereas the reverse mis-decoding can succeed silently.
    As a last resort the bytes are decoded as GBK with replacement characters
    so the caller always receives text.
    """
    for encoding in ('utf-8', 'gbk', 'gb18030'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw.decode('gbk', errors='replace')


def _extract_chapter_html(soup) -> str:
    """Return the chapter container's HTML (tags preserved), or "" if absent."""
    container = (
        soup.find('div', class_='nodeContent')
        # Fallback class / id names for the content container.
        or soup.find('div', class_='content')
        or soup.find('div', {'id': 'content'})
    )
    return str(container) if container else ""


def save_novel_content(
        start_page: int,
        end_page: int,
        novel_id: int,
        save_dir: str,
        start_chapter: int = 101,
        max_retries: int = 3,
        request_delay: float = 1.5,
        base_delay: float = 1.0
) -> None:
    """
    Download a range of pages of one novel and save each as a local file.

    Every page ``https://wap.faloo.com/{novel_id}_{page}.html`` in the
    inclusive range is fetched with ``fetch_with_retry``; the HTML of its
    content container is written (UTF-8) to ``save_dir`` under the name
    produced by ``generate_filename``. A summary is printed at the end.

    Args:
        start_page: First page number to fetch.
        end_page: Last page number to fetch (inclusive). An empty range
            (``end_page < start_page``) fetches nothing and reports 0.0%.
        novel_id: Novel ID used to build the page URLs.
        save_dir: Output directory; created if it does not exist.
        start_chapter: Chapter number given to the first page (default 101).
        max_retries: Attempts per page, forwarded to ``fetch_with_retry``.
        request_delay: Pause in seconds between pages, to go easy on the site.
        base_delay: Base retry delay in seconds, forwarded to
            ``fetch_with_retry``.

    Notes:
        A page whose content container cannot be found is counted as a
        failure and no file is written for it, so a previously saved good
        copy is never overwritten by an empty one.
    """
    # Accept-Encoding is deliberately left to requests/urllib3: they only
    # advertise codecs they can decode, whereas hard-coding "br" yields an
    # undecodable body when no Brotli package is installed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
    }

    os.makedirs(save_dir, exist_ok=True)

    total_pages = max(end_page - start_page + 1, 0)
    success_count = 0
    fail_count = 0

    for i in range(start_page, end_page + 1):
        url = f"https://wap.faloo.com/{novel_id}_{i}.html"
        filename = generate_filename(i - start_page, start_chapter)
        filepath = os.path.join(save_dir, filename)

        print(f"正在爬取第{i}页: {url}")

        response = fetch_with_retry(
            url=url,
            headers=headers,
            max_retries=max_retries,
            base_delay=base_delay
        )

        if response is None:
            print(f"第{i}页爬取失败，跳过此页")
            fail_count += 1
            continue

        try:
            # Decode the raw bytes ourselves: response.text substitutes
            # replacement characters instead of raising, so it cannot be used
            # to detect a wrong encoding.
            soup = BeautifulSoup(_decode_html(response.content), 'html.parser')
            content = _extract_chapter_html(soup)

            if not content.strip():
                print(f"警告：第{i}页未找到有效内容")
                fail_count += 1
            else:
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)

                print(f"已保存第{i}页内容到：{filename}")
                success_count += 1

        except Exception as e:
            # Per-page boundary: report, count, and carry on with the rest.
            print(f"处理第{i}页内容时出错: {str(e)}")
            fail_count += 1
        finally:
            # Throttle between pages (with jitter); no wait after the last one.
            if i < end_page:
                delay_time = request_delay + random.uniform(0, 0.3)
                print(f"等待{delay_time:.2f}秒后继续...")
                time.sleep(delay_time)

    # Summary; guard the rate against an empty page range.
    success_rate = success_count / total_pages * 100 if total_pages else 0.0
    print("\n爬取完成！")
    print(f"成功: {success_count} 页")
    print(f"失败: {fail_count} 页")
    print(f"成功率: {success_rate:.1f}%")


# Script entry point: download a fixed page range of one novel with
# hard-coded settings.
if __name__ == "__main__":
    try:
        # NOTE(review): chapters are saved to ...\dce7, while the presets in
        # generate_filename2 (merge step) only read ...\dce through ...\dce5 —
        # confirm which folder is intended.
        save_path = r"D:\Users\wangrusheng\Downloads\dce7"
        novel_id = 1521567
        start_page = 1
        end_page = 15
        start_chapter = 101

        # Retry / throttling settings
        max_retries = 3  # maximum number of attempts per page
        request_delay = 2.0  # pause between pages (seconds) to avoid hammering the site
        base_delay = 1.0  # base delay for retry backoff (seconds)

        print("开始爬取小说内容...")
        save_novel_content(
            start_page=start_page,
            end_page=end_page,
            novel_id=novel_id,
            save_dir=save_path,
            start_chapter=start_chapter,
            max_retries=max_retries,
            request_delay=request_delay,
            base_delay=base_delay
        )

    except KeyboardInterrupt:
        # Ctrl+C: stop quietly instead of dumping a traceback
        print("\n用户中断爬取程序")
    except Exception as e:
        print(f"爬取程序执行失败: {str(e)}")

step4:

python 复制代码

import os
import re
import glob



def generate_filename2(start_pages):
    """
    Merge the chapter files of one preset download folder.

    ``start_pages`` selects the folder: 1 maps to ``...\\Downloads\\dce`` and
    2-5 map to ``...\\Downloads\\dce2`` through ``dce5``. For a known
    selector the folder is created if missing, its chapter files are merged
    into ``merged.txt`` inside that folder via ``merge_txt_files``, and the
    result is reported. Any other value merges nothing. A trace line with
    the selector is always printed at the end. Returns None.
    """
    base_dir = r'D:\Users\wangrusheng\Downloads\dce'
    # (selector, folder-name suffix); selector 1 uses the un-suffixed folder.
    presets = ((1, ''), (2, '2'), (3, '3'), (4, '4'), (5, '5'))

    for selector, suffix in presets:
        if start_pages == selector:
            input_dir = base_dir + suffix
            output_file = input_dir + r'\merged.txt'

            # Make sure the folder holding the merged file exists.
            target_dir = os.path.dirname(output_file)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            count = merge_txt_files(input_dir, output_file)
            print(f"成功合并 {count} 个文件。")
            print(f"合并后的文件保存在: {output_file}")
            break

    print(f"处理第测试{start_pages}")





def merge_txt_files(input_dir, output_file):
    """
    Concatenate every ``chapter_<number>.txt`` file of a directory, in
    numeric chapter order, into a single output file.

    Each chapter is preceded by a ``=== 第<number>章 ===`` header line and
    followed by a blank line. Files that cannot be read are reported on
    stdout and skipped.

    :param input_dir: directory scanned for ``*.txt`` files
    :param output_file: path of the merged file (overwritten, UTF-8)
    :return: number of chapter files merged
    """
    chapter_name = re.compile(r'^chapter_(\d+)\.txt$', re.IGNORECASE)

    # Collect (chapter number, file name, full path) for every matching file.
    chapters = []
    for path in glob.glob(os.path.join(input_dir, "*.txt")):
        name = os.path.basename(path)
        hit = chapter_name.match(name)
        if hit is None:
            continue
        chapters.append((int(hit.group(1)), name, path))

    # Numeric order, so chapter_10 comes after chapter_2.
    ordered = sorted(chapters, key=lambda entry: entry[0])

    merged_count = 0
    with open(output_file, 'w', encoding='utf-8') as merged:
        for number, name, path in ordered:
            try:
                with open(path, 'r', encoding='utf-8') as chapter:
                    text = chapter.read()
                    # Header, chapter body, then a blank line as separator.
                    merged.writelines((f"=== 第{number}章 ===\n", text, '\n\n'))
                    merged_count += 1
                    print(f"已合并: {name}")
            except Exception as e:
                print(f"处理文件 {name} 时出错: {e}")

    return merged_count


# Usage example: merge the chapter files of preset folder 1.
if __name__ == "__main__":
    # Selector 1 is the first preset handled by generate_filename2
    generate_filename2(1)

step5:

python 复制代码

import os
import math

def split_file_into_parts(file_path, num_parts=3, remark=""):
    """
    Split a file byte-wise into a fixed number of smaller files.

    The parts are written next to the original as
    ``<name>_part<k><ext>`` (k = 1..num_parts). Bytes are distributed as
    evenly as possible: part sizes differ by at most one byte, with the
    larger parts first, so no part is empty unless the file has fewer bytes
    than ``num_parts``.

    :param file_path: path of the file to split
    :param num_parts: number of parts to create (default 3); must be >= 1
    :param remark: text appended (after a newline, UTF-8) to every part.
        WARNING: appending text to chunks of a non-text file corrupts them,
        and a byte-wise cut may also land inside a multi-byte character.
    :raises ValueError: if ``num_parts`` is smaller than 1
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be at least 1, got {num_parts!r}")

    # Warn whenever a remark is going to be appended to binary chunks.
    if remark:
        print(f"警告：您正在尝试向二进制分割文件添加文本备注'{remark[:20]}...'。这极有可能导致生成的文件损坏！")

    # Output naming pieces: directory, stem and original extension.
    base_name, ext = os.path.splitext(os.path.basename(file_path))
    dir_path = os.path.dirname(file_path) or '.'  # bare file name -> current directory

    file_size = os.path.getsize(file_path)
    print(f"原文件大小: {file_size} 字节 ({file_size / 1024:.2f} KB)")

    # The first `extra` parts carry one byte more than the rest; the value
    # printed below is that larger size (the ceiling of size / parts).
    base_size, extra = divmod(file_size, num_parts)
    part_size = base_size + (1 if extra else 0)
    print(f"每部分大小: {part_size} 字节 ({part_size / 1024:.2f} KB)")

    with open(file_path, 'rb') as f:
        for part_num in range(1, num_parts + 1):
            output_file = os.path.join(dir_path, f"{base_name}_part{part_num}{ext}")

            if part_num == num_parts:
                # Last part: take everything that is left.
                chunk = f.read()
            else:
                chunk = f.read(base_size + (1 if part_num <= extra else 0))

            with open(output_file, 'wb') as out_file:
                out_file.write(chunk)
                # Dangerous for binary data: the remark is not part of the
                # original content (see the warning above).
                if remark:
                    out_file.write(f"\n{remark}".encode('utf-8'))

            actual_size = len(chunk)
            print(f"已创建: {output_file} (大小: {actual_size} 字节, {actual_size / 1024:.2f} KB)")

    print(f"文件已成功分割成 {num_parts} 个部分！")


def split_text_file_into_parts(file_path, num_parts=3, encoding='utf-8', remark=""):
    """
    Split a text file into a fixed number of smaller files by character
    count, so that no multi-byte character is ever cut in half.

    The parts are written next to the original as
    ``<name>_part<k><ext>`` (k = 1..num_parts). Characters are distributed
    as evenly as possible: part lengths differ by at most one character,
    with the longer parts first, so no part is empty unless the file has
    fewer characters than ``num_parts``.

    :param file_path: path of the text file to split
    :param num_parts: number of parts to create (default 3); must be >= 1
    :param encoding: encoding used to read the source and write the parts
    :param remark: optional text appended, after a newline, to every part
    :raises ValueError: if ``num_parts`` is smaller than 1
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be at least 1, got {num_parts!r}")

    # Output naming pieces: directory, stem and original extension.
    base_name, ext = os.path.splitext(os.path.basename(file_path))
    dir_path = os.path.dirname(file_path) or '.'  # bare file name -> current directory

    # The whole file is held in memory; splitting works on characters.
    with open(file_path, 'r', encoding=encoding) as f:
        content = f.read()

    total_chars = len(content)
    file_size = len(content.encode(encoding))
    print(f"原文件大小: {file_size} 字节 ({file_size / 1024:.2f} KB)")
    print(f"总字符数: {total_chars}")

    # The first `extra` parts carry one character more than the rest; the
    # value printed below is that larger length (the ceiling of the ratio).
    base_chars, extra = divmod(total_chars, num_parts)
    chars_per_part = base_chars + (1 if extra else 0)
    print(f"每部分字符数: {chars_per_part}")

    start_idx = 0
    for part_num in range(1, num_parts + 1):
        end_idx = start_idx + base_chars + (1 if part_num <= extra else 0)
        chunk = content[start_idx:end_idx]
        start_idx = end_idx

        output_file = os.path.join(dir_path, f"{base_name}_part{part_num}{ext}")

        with open(output_file, 'w', encoding=encoding) as out_file:
            out_file.write(chunk)
            if remark:
                # Leading newline keeps the remark off the last content line.
                out_file.write(f"\n{remark}")

        actual_size = len(chunk.encode(encoding))
        actual_chars = len(chunk)
        print(f"已创建: {output_file} (大小: {actual_size} 字节, 字符数: {actual_chars})")

    print(f"文件已成功分割成 {num_parts} 个部分！")



# Script entry point: split the merged novel into three text parts and append
# the same AI prompt to the end of each part.
if __name__ == "__main__":
    file_path = r"D:\Users\wangrusheng\Downloads\dce5\merged.txt"




    print("\n=== 方法2: 文本分割（适合文本文件，避免乱码） ===")
    # Remark appended to every part: the prompt that will be sent to the AI
    my_remark = "请ai模拟《战情室》,主持人康(永恒的诘问者)，社博士a=上述材料构建的社会中的上层 vs 社会学博士b=上述材料构建的社会中下层,不低于1500字,忽略神秘学,聚焦社会实然和唯物主义"

    split_text_file_into_parts(file_path, num_parts=3, encoding='utf-8', remark=my_remark)

end