python
import pandas as pd
import math
1.数据预处理
python
# Two toy documents that serve as the whole corpus for this walkthrough.
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

# Tokenise on single spaces (case and punctuation are left as-is).
wordsA, wordsB = docA.split(" "), docB.split(" ")

# Vocabulary: every distinct token that appears in either document.
wordsSet = set(wordsA) | set(wordsB)
print(wordsSet)
{'on', 'my', 'face', 'sat', 'dog', 'The', 'cat', 'bed'}
2.计算词的频数
python
# Start every vocabulary word at zero so both documents share the same keys
# (and therefore the same columns when tabulated).
wordCountA = dict.fromkeys(wordsSet, 0)
wordCountB = dict.fromkeys(wordsSet, 0)

# Tally raw occurrences of each token per document.
for word in wordsA:
    wordCountA[word] += 1
for word in wordsB:
    wordCountB[word] += 1

# Bare expression: the table is only rendered when run in a notebook/REPL cell.
pd.DataFrame([wordCountA, wordCountB])
| | on | my | face | sat | dog | The | cat | bed |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 |
| 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 |
3.计算词的频率
python
def computeTF(wordCount, docWords):
    """Compute the term frequency of every vocabulary word in one document.

    Args:
        wordCount: Mapping of word -> number of occurrences in the document.
        docWords: The document's token list; its length is the normaliser.

    Returns:
        A dict mapping each word in ``wordCount`` to
        ``count / len(docWords)``. For an empty document every frequency
        is ``0.0``.
    """
    docCount = len(docWords)
    if docCount == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return dict.fromkeys(wordCount, 0.0)
    return {word: count / float(docCount) for word, count in wordCount.items()}
# Term frequencies for each document, computed over the shared vocabulary.
tfA = computeTF(wordCount=wordCountA, docWords=wordsA)
tfB = computeTF(wordCount=wordCountB, docWords=wordsB)
print("tfA ", tfA)
tfA {'on': 0.16666666666666666, 'my': 0.16666666666666666, 'face': 0.16666666666666666, 'sat': 0.16666666666666666, 'dog': 0.0, 'The': 0.16666666666666666, 'cat': 0.16666666666666666, 'bed': 0.0}
4.计算逆文档频率
python
def computeIDF(docList):
    """Compute a smoothed inverse document frequency for every word.

    The score for a word is ``log10((N + 1) / (df + 1))`` where ``N`` is the
    number of documents and ``df`` is the number of documents in which the
    word's count is greater than zero.

    Args:
        docList: List of per-document mappings of word -> occurrence count.

    Returns:
        A dict mapping every word seen in any document to its IDF score,
        in first-seen order. An empty ``docList`` yields an empty dict.
    """
    doc_len = len(docList)

    # Collect document frequencies over the union of all documents' keys, so
    # documents with differing vocabularies no longer raise KeyError.
    docFreq = {}
    for doc in docList:
        for word, count in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    # The +1 on both sides keeps the ratio finite when a word occurs nowhere.
    return {
        word: math.log10((doc_len + 1) / float(freq + 1))
        for word, freq in docFreq.items()
    }
# Inverse document frequency over the two-document corpus.
idf = computeIDF(docList=[wordCountA, wordCountB])
print(idf)
{'on': 0.0, 'my': 0.0, 'face': 0.17609125905568124, 'sat': 0.0, 'dog': 0.17609125905568124, 'The': 0.0, 'cat': 0.17609125905568124, 'bed': 0.17609125905568124}
5.计算 TF-IDF
python
def computeTFIDF(tf, idf):
    """Combine term frequencies with IDF scores into TF-IDF weights.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency.

    Returns:
        A dict mapping each word in ``tf`` to ``tf[word] * idf[word]``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idf``.
    """
    # Use a distinct loop name so the ``tf`` parameter is not shadowed.
    return {word: freq * idf[word] for word, freq in tf.items()}
# TF-IDF weights per document, then tabulated side by side.
tfidfA = computeTFIDF(tf=tfA, idf=idf)
tfidfB = computeTFIDF(tf=tfB, idf=idf)
# Bare expression: the table is only rendered when run in a notebook/REPL cell.
pd.DataFrame([tfidfA, tfidfB])
| | on | my | face | sat | dog | The | cat | bed |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.029349 | 0.0 | 0.000000 | 0.0 | 0.029349 | 0.000000 |
| 1 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.029349 | 0.0 | 0.000000 | 0.029349 |