Implementing fastText in Python

1. Using the open-source fasttext library

python
import fasttext

# Prepare the training data
# The data should be a plain-text file in which each line is one sample: a label followed by the text.
# Labels use the format __label__<your-label>, e.g.: __label__positive I love this movie!

train_data = 'path/to/your/training/data.txt'

# Train the model
model = fasttext.train_supervised(train_data)

# Save the model
model.save_model('fasttext_model.bin')

# Load the model
model = fasttext.load_model('fasttext_model.bin')

# Use the model to make a prediction (returns the predicted labels and their probabilities)
text = 'This is an example sentence.'
prediction = model.predict(text)

print(f'Text: {text}')
print(f'Prediction: {prediction}')

# Evaluate the model on the test data (test() returns the sample count, precision, and recall)
test_data = 'path/to/your/test/data.txt'
result = model.test(test_data)

print(f'Precision: {result[1]}')
print(f'Recall: {result[2]}')
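
The defaults are enough for a quick start, but supervised training usually benefits from a few hyperparameters. A minimal sketch with illustrative values (not tuned recommendations):

python
# Train with explicit hyperparameters (illustrative values; adjust for your data)
model = fasttext.train_supervised(
    input=train_data,
    lr=0.5,           # learning rate
    epoch=25,         # number of passes over the training data
    wordNgrams=2,     # also use word bigrams as features
    dim=100           # embedding dimension
)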

2. Using TensorFlow

python
import tensorflow as tf
import numpy as np
import re
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def tokenize(text):
    return re.findall(r'\w+', text.lower())

def preprocess_data(data):
    sentences = []
    labels = []
    for line in data:
        label, text = line.split(' ', 1)
        sentences.append(tokenize(text))
        labels.append(label)
    return sentences, labels

def build_vocab(sentences, min_count=5):
    word_counts = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            word_counts[word] += 1

    # Keep words that appear at least min_count times and give them contiguous indices
    kept_words = [word for word, count in word_counts.items() if count >= min_count]
    vocab = {word: idx for idx, word in enumerate(kept_words)}
    return vocab

def sentence_to_vector(sentence, vocab):
    vector = np.zeros(len(vocab))
    for word in sentence:
        if word in vocab:
            vector[vocab[word]] += 1
    return vector

# Example data
data = [
    "__label__positive I love this movie!",
    "__label__negative This movie is terrible!",
    "__label__positive This is a great film.",
    "__label__negative I didn't enjoy the movie."
]

sentences, labels = preprocess_data(data)
vocab = build_vocab(sentences, min_count=1)  # the toy dataset is tiny, so keep every word
label_encoder = LabelEncoder().fit(labels)

X = np.array([sentence_to_vector(sentence, vocab) for sentence in sentences])
y = label_encoder.transform(labels)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model: a single softmax layer over bag-of-words vectors
model = tf.keras.Sequential([
    tf.keras.layers.Dense(len(set(labels)), input_shape=(len(vocab),), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Predict
test_sentence = "This is an amazing movie!"
prediction = model.predict(np.array([sentence_to_vector(tokenize(test_sentence), vocab)]))
predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
print(f'Text: {test_sentence}')
print(f'Prediction: {predicted_label}')

# Evaluate on the held-out test set
y_pred = model.predict(X_test)
y_pred = label_encoder.inverse_transform(np.argmax(y_pred, axis=1))
y_true = label_encoder.inverse_transform(y_test)
print(classification_report(y_true, y_pred))
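
The dense layer above is a plain bag-of-words softmax classifier. fastText's supervised model instead averages learned word (and n-gram) embeddings before the softmax layer. A minimal sketch of that architecture in Keras; vocab_size, embedding_dim, num_classes and the padded integer-sequence input are assumptions for illustration, not taken from the code above:

python
import tensorflow as tf

# Illustrative sizes; in practice these come from your data
vocab_size = 10000     # number of words (plus n-gram hash buckets, if used)
embedding_dim = 100    # fastText's default vector dimension
num_classes = 2

fasttext_like = tf.keras.Sequential([
    # Map each token index to a learned embedding vector
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Average the token embeddings into a single sentence vector,
    # as fastText's supervised classifier does
    tf.keras.layers.GlobalAveragePooling1D(),
    # Linear layer + softmax over the labels
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

fasttext_like.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
# fit()/predict() then take padded integer sequences instead of bag-of-words vectors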

3. A from-scratch Python implementation

python
import numpy as np
import re
from collections import defaultdict
from sklearn.metrics import classification_report

def tokenize(text):
    return re.findall(r'\w+', text.lower())

def preprocess_data(data):
    sentences = []
    labels = []
    for line in data:
        label, text = line.split(' ', 1)
        sentences.append(tokenize(text))
        labels.append(label)
    return sentences, labels

def build_vocab(sentences, min_count=5):
    word_counts = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            word_counts[word] += 1

    # Keep words that appear at least min_count times and give them contiguous indices
    kept_words = [word for word, count in word_counts.items() if count >= min_count]
    vocab = {word: idx for idx, word in enumerate(kept_words)}
    return vocab

def build_label_index(labels):
    label_index = {}
    for label in labels:
        if label not in label_index:
            label_index[label] = len(label_index)
    return label_index

def sentence_to_vector(sentence, vocab):
    vector = np.zeros(len(vocab))
    for word in sentence:
        if word in vocab:
            vector[vocab[word]] += 1
    return vector

def train_fasttext(sentences, labels, vocab, label_index, lr=0.01, epochs=10):
    # One weight row per label, one column per vocabulary word (a linear softmax classifier)
    W = np.random.randn(len(label_index), len(vocab))
    for epoch in range(epochs):
        for sentence, label in zip(sentences, labels):
            vector = sentence_to_vector(sentence, vocab)
            scores = W.dot(vector)
            # Softmax over the label scores (subtract the max for numerical stability)
            scores -= np.max(scores)
            probs = np.exp(scores) / np.sum(np.exp(scores))
            # One-hot target for the true label
            target = np.zeros(len(label_index))
            target[label_index[label]] = 1
            # Gradient step for softmax + cross-entropy loss
            W -= lr * np.outer(probs - target, vector)

    return W

def predict_fasttext(sentence, W, vocab, label_index):
    vector = sentence_to_vector(sentence, vocab)
    scores = W.dot(vector)
    # The label with the highest score also has the highest softmax probability
    max_index = np.argmax(scores)
    index_to_label = {idx: label for label, idx in label_index.items()}
    return index_to_label[max_index]

# Example data
data = [
    "__label__positive I love this movie!",
    "__label__negative This movie is terrible!",
    "__label__positive This is a great film.",
    "__label__negative I didn't enjoy the movie."
]

sentences, labels = preprocess_data(data)
vocab = build_vocab(sentences, min_count=1)  # the toy dataset is tiny, so keep every word
label_index = build_label_index(labels)

# Train the model
W = train_fasttext(sentences, labels, vocab, label_index)

# Predict
test_sentence = "This is an amazing movie!"
prediction = predict_fasttext(tokenize(test_sentence), W, vocab, label_index)
print(f'Text: {test_sentence}')
print(f'Prediction: {prediction}')

# Evaluate (on the training data itself, since this toy example has no separate test set)
y_true = labels
y_pred = [predict_fasttext(sentence, W, vocab, label_index) for sentence in sentences]
print(classification_report(y_true, y_pred))
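
One piece of fastText that this from-scratch version leaves out is word n-gram features (the wordNgrams option in the library). A minimal sketch of how bigrams could be appended to each token list before building the vocabulary; add_bigrams is a hypothetical helper, not part of the code above:

python
def add_bigrams(tokens):
    # Append word-bigram features, analogous to fastText's wordNgrams=2
    return tokens + [f'{a}_{b}' for a, b in zip(tokens, tokens[1:])]

# Usage: expand each tokenized sentence before calling build_vocab / sentence_to_vector
sentences = [add_bigrams(sentence) for sentence in sentences]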