对音频切分成小音频(机器学习用)

我是把so-vits中小工具,分析源码然后提取出来了。以后可以写在自己的程序里。

-------流程(这是我做的流程,你可以不用看)

从开源代码中快速获取自己需要的东西

如果有界面f12看他里面的接口,然后在源码中全局搜索,没有接口比如socket,看他的消息字段,然后推测。然后提取补齐代码就行了


你需要看的

提取出来有3个类

run.py是我自己写的

其他是我提取的源码,首先你得install一些包

numpy,librosa,soundfile

slicer2.py

python 复制代码
import numpy as np


# This function is obtained from librosa.
def get_rms(
    y,
    *,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):
    padding = (int(frame_length // 2), int(frame_length // 2))
    y = np.pad(y, padding, mode=pad_mode)

    axis = -1
    # put our new within-frame axis at the end for now
    out_strides = y.strides + tuple([y.strides[axis]])
    # Reduce the shape on the framing axis
    x_shape_trimmed = list(y.shape)
    x_shape_trimmed[axis] -= frame_length - 1
    out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
    xw = np.lib.stride_tricks.as_strided(
        y, shape=out_shape, strides=out_strides
    )
    if axis < 0:
        target_axis = axis - 1
    else:
        target_axis = axis + 1
    xw = np.moveaxis(xw, -1, target_axis)
    # Downsample along the target axis
    slices = [slice(None)] * xw.ndim
    slices[axis] = slice(0, None, hop_length)
    x = xw[tuple(slices)]

    # Calculate power
    power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)

    return np.sqrt(power)


class Slicer:
    def __init__(self,
                 sr: int,
                 threshold: float = -40.,
                 min_length: int = 5000,
                 min_interval: int = 300,
                 hop_size: int = 20,
                 max_sil_kept: int = 5000):
        if not min_length >= min_interval >= hop_size:
            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
        if not max_sil_kept >= hop_size:
            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        if len(waveform.shape) > 1:
            return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
        else:
            return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return [waveform]
        rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return [waveform]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
            for i in range(len(sil_tags) - 1):
                chunks.append(self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]))
            if sil_tags[-1][1] < total_frames:
                chunks.append(self._apply_slice(waveform, sil_tags[-1][1], total_frames))
            return chunks


def main():
    import os.path
    from argparse import ArgumentParser

    import librosa
    import soundfile

    parser = ArgumentParser()
    parser.add_argument('audio', type=str, help='The audio to be sliced')
    parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
    parser.add_argument('--db_thresh', type=float, required=False, default=-40,
                        help='The dB threshold for silence detection')
    parser.add_argument('--min_length', type=int, required=False, default=5000,
                        help='The minimum milliseconds required for each sliced audio clip')
    parser.add_argument('--min_interval', type=int, required=False, default=300,
                        help='The minimum milliseconds for a silence part to be sliced')
    parser.add_argument('--hop_size', type=int, required=False, default=10,
                        help='Frame length in milliseconds')
    parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
                        help='The maximum silence length kept around the sliced clip, presented in milliseconds')
    args = parser.parse_args()
    out = args.out
    if out is None:
        out = os.path.dirname(os.path.abspath(args.audio))
    audio, sr = librosa.load(args.audio, sr=None, mono=False)
    slicer = Slicer(
        sr=sr,
        threshold=args.db_thresh,
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):
        os.makedirs(out)
    for i, chunk in enumerate(chunks):
        if len(chunk.shape) > 1:
            chunk = chunk.T
        soundfile.write(os.path.join(out, f'%s_%d.wav' % (os.path.basename(args.audio).rsplit('.', maxsplit=1)[0], i)), chunk, sr)


if __name__ == '__main__':
    main()

auto_slicer.py

python 复制代码
import os
import numpy as np
import librosa
import soundfile as sf
from slicer2 import Slicer

class AutoSlicer:
    def __init__(self):
        self.slicer_params = {
            "threshold": -40,
            "min_length": 5000,
            "min_interval": 300,
            "hop_size": 10,
            "max_sil_kept": 500,
        }
        self.original_min_interval = self.slicer_params["min_interval"]

    def auto_slice(self, filename, input_dir, output_dir, max_sec):
        audio, sr = librosa.load(os.path.join(input_dir, filename), sr=None, mono=False)
        slicer = Slicer(sr=sr, **self.slicer_params)
        chunks = slicer.slice(audio)
        files_to_delete = []
        for i, chunk in enumerate(chunks):
            if len(chunk.shape) > 1:
                chunk = chunk.T
            output_filename = f"{os.path.splitext(filename)[0]}_{i}"
            output_filename = "".join(c for c in output_filename if c.isascii() or c == "_") + ".wav"
            output_filepath = os.path.join(output_dir, output_filename)
            sf.write(output_filepath, chunk, sr)
            #Check and re-slice audio that more than max_sec.
            while True:
                new_audio, sr = librosa.load(output_filepath, sr=None, mono=False)
                if librosa.get_duration(y=new_audio, sr=sr) <= max_sec:
                    break
                self.slicer_params["min_interval"] = self.slicer_params["min_interval"] // 2
                if self.slicer_params["min_interval"] >= self.slicer_params["hop_size"]:
                    new_chunks = Slicer(sr=sr, **self.slicer_params).slice(new_audio)
                    for j, new_chunk in enumerate(new_chunks):
                        if len(new_chunk.shape) > 1:
                            new_chunk = new_chunk.T
                        new_output_filename = f"{os.path.splitext(output_filename)[0]}_{j}.wav"
                        sf.write(os.path.join(output_dir, new_output_filename), new_chunk, sr)
                    files_to_delete.append(output_filepath)
                else:
                    break
            self.slicer_params["min_interval"] = self.original_min_interval
        for file_path in files_to_delete:
            if os.path.exists(file_path):
                os.remove(file_path)

    def merge_short(self, output_dir, max_sec, min_sec):
        short_files = []
        for filename in os.listdir(output_dir):
            filepath = os.path.join(output_dir, filename)
            if filename.endswith(".wav"):
                audio, sr = librosa.load(filepath, sr=None, mono=False)
                duration = librosa.get_duration(y=audio, sr=sr)
                if duration < min_sec:
                    short_files.append((filepath, audio, duration))
        short_files.sort(key=lambda x: x[2], reverse=True)
        merged_audio = []
        current_duration = 0
        for filepath, audio, duration in short_files:
            if current_duration + duration <= max_sec:
                merged_audio.append(audio)
                current_duration += duration
                os.remove(filepath)
            else:
                if merged_audio:
                    output_audio = np.concatenate(merged_audio, axis=-1)
                    if len(output_audio.shape) > 1:
                        output_audio = output_audio.T
                    output_filename = f"merged_{len(os.listdir(output_dir))}.wav"
                    sf.write(os.path.join(output_dir, output_filename), output_audio, sr)
                    merged_audio = [audio]
                    current_duration = duration
                    os.remove(filepath)
        if merged_audio and current_duration >= min_sec:
            output_audio = np.concatenate(merged_audio, axis=-1)
            if len(output_audio.shape) > 1:
                output_audio = output_audio.T
            output_filename = f"merged_{len(os.listdir(output_dir))}.wav"
            sf.write(os.path.join(output_dir, output_filename), output_audio, sr)
    
    def slice_count(self, input_dir, output_dir):
        orig_duration = final_duration = 0
        for file in os.listdir(input_dir):
            if file.endswith(".wav"):
                _audio, _sr = librosa.load(os.path.join(input_dir, file), sr=None, mono=False)
                orig_duration += librosa.get_duration(y=_audio, sr=_sr)
        wav_files = [file for file in os.listdir(output_dir) if file.endswith(".wav")]
        num_files = len(wav_files)
        max_duration = -1
        min_duration = float("inf")
        for file in wav_files:
            file_path = os.path.join(output_dir, file)
            audio, sr = librosa.load(file_path, sr=None, mono=False)
            duration = librosa.get_duration(y=audio, sr=sr)
            final_duration += float(duration)
            if duration > max_duration:
                max_duration = float(duration)
            if duration < min_duration:
                min_duration = float(duration)
        return num_files, max_duration, min_duration, orig_duration, final_duration

run.py

python 复制代码
import os
from auto_slicer import AutoSlicer
import librosa
def slicer_fn(input_dir, output_dir, process_method, max_sec, min_sec):
    if output_dir == "":
        return "请先选择输出的文件夹"
    if output_dir == input_dir:
        return "输出目录不能和输入目录相同"
    slicer = AutoSlicer()
    if os.path.exists(output_dir) is not True:
        os.makedirs(output_dir)
    for filename in os.listdir(input_dir):
        if filename.lower().endswith(".wav"):
            slicer.auto_slice(filename, input_dir, output_dir, max_sec)
    if process_method == "丢弃":
        for filename in os.listdir(output_dir):
            if filename.endswith(".wav"):
                filepath = os.path.join(output_dir, filename)
                audio, sr = librosa.load(filepath, sr=None, mono=False)
                if librosa.get_duration(y=audio, sr=sr) < min_sec:
                    os.remove(filepath)
    elif process_method == "将过短音频整合为长音频":
        slicer.merge_short(output_dir, max_sec, min_sec)
    file_count, max_duration, min_duration, orig_duration, final_duration = slicer.slice_count(input_dir, output_dir)
    hrs = int(final_duration / 3600)
    mins = int((final_duration % 3600) / 60)
    sec = format(float(final_duration % 60), '.2f')
    rate = format(100 * (final_duration / orig_duration), '.2f') if orig_duration != 0 else 0
    rate_msg = f"为原始音频时长的{rate}%" if rate != 0 else "因未知问题,无法计算切片时长的占比"
    return f"成功将音频切分为{file_count}条片段,其中最长{max_duration}秒,最短{min_duration}秒,切片后的音频总时长{hrs:02d}小时{mins:02d}分{sec}秒,{rate_msg}"

input_dir="F:\sliper\input"#输入文件夹(这里面可以放多个wav)
output_dir="F:\sliper\output"#输出文件夹
process_method="丢弃"#如果音频小于3就丢弃
max_sec=15#音频最长为15
min_sec=3#音频最小为3
slicer_fn(input_dir,output_dir,process_method,max_sec,min_sec)

测试输入

得到

相关推荐
NAGNIP9 小时前
一文搞懂深度学习中的通用逼近定理!
人工智能·算法·面试
冬奇Lab10 小时前
一天一个开源项目(第36篇):EverMemOS - 跨 LLM 与平台的长时记忆 OS,让 Agent 会记忆更会推理
人工智能·开源·资讯
冬奇Lab10 小时前
OpenClaw 源码深度解析(一):Gateway——为什么需要一个"中枢"
人工智能·开源·源码阅读
AngelPP14 小时前
OpenClaw 架构深度解析:如何把 AI 助手搬到你的个人设备上
人工智能
宅小年14 小时前
Claude Code 换成了Kimi K2.5后,我再也回不去了
人工智能·ai编程·claude
九狼14 小时前
Flutter URL Scheme 跨平台跳转
人工智能·flutter·github
ZFSS14 小时前
Kimi Chat Completion API 申请及使用
前端·人工智能
天翼云开发者社区16 小时前
春节复工福利就位!天翼云息壤2500万Tokens免费送,全品类大模型一键畅玩!
人工智能·算力服务·息壤
知识浅谈16 小时前
教你如何用 Gemini 将课本图片一键转为精美 PPT
人工智能
Ray Liang16 小时前
被低估的量化版模型,小身材也能干大事
人工智能·ai·ai助手·mindx