数据分析作业四-基于用户及物品数据进行内容推荐

python 复制代码
## 导入支持库
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os, sys
import re
import seaborn as sns
python 复制代码
## 加载数据集并检查书籍,用户和评级数据集的形状
books = pd.read_csv('F:\\data\\bleeding_data\\BX-Books.csv',
                    sep=None,encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor',
                 'yearOfPublication', 'publisher',
                 'imageUrlS', 'imageUrlM', 'imageUrlL']

users = pd.read_csv('F:\\data\\bleeding_data\\BX-Users.csv',
                    sep=None, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']


ratings = pd.read_csv('F:\\data\\bleeding_data\\BX-Book-Ratings.csv',
                      sep=None, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

print (books.shape)
print (users.shape)
print (ratings.shape)
复制代码
(271360, 8)
(278858, 3)
(1149780, 3)
python 复制代码
## 一、图书数据集
books.head()

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher | imageUrlS | imageUrlM | imageUrlL |
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford | 2002 | Oxford University Press | http://images.amazon.com/images/P/0195153448.0... | http://images.amazon.com/images/P/0195153448.0... | http://images.amazon.com/images/P/0195153448.0... |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright | 2001 | HarperFlamingo Canada | http://images.amazon.com/images/P/0002005018.0... | http://images.amazon.com/images/P/0002005018.0... | http://images.amazon.com/images/P/0002005018.0... |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este | 1991 | HarperPerennial | http://images.amazon.com/images/P/0060973129.0... | http://images.amazon.com/images/P/0060973129.0... | http://images.amazon.com/images/P/0060973129.0... |
| 3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata | 1999 | Farrar Straus Giroux | http://images.amazon.com/images/P/0374157065.0... | http://images.amazon.com/images/P/0374157065.0... | http://images.amazon.com/images/P/0374157065.0... |

4 0393045218 The Mummies of Urumchi E. J. W. Barber 1999 W. W. Norton & Company http://images.amazon.com/images/P/0393045218.0... http://images.amazon.com/images/P/0393045218.0... http://images.amazon.com/images/P/0393045218.0...
python 复制代码
## url不需要分析,进行删除
books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)
books.head()

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford | 2002 | Oxford University Press |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright | 2001 | HarperFlamingo Canada |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este | 1991 | HarperPerennial |
| 3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata | 1999 | Farrar Straus Giroux |

4 0393045218 The Mummies of Urumchi E. J. W. Barber 1999 W. W. Norton & Company
python 复制代码
## books.dtypes
books.dtypes
复制代码
ISBN                 object
bookTitle            object
bookAuthor           object
yearOfPublication    object
publisher            object
dtype: object
python 复制代码
## 现在检查属性的唯一值
books.bookTitle.unique()
复制代码
array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
       'Lily Dale : The True Story of the Town that Talks to the Dead',
       "Republic (World's Classics)",
       "A Guided Tour of Rene Descartes' Meditations on First Philosophy with Complete Translations of the Meditations by Ronald Rubin"],
      dtype=object)
python 复制代码
books.yearOfPublication.unique()
复制代码
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       '2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
      dtype=object)
python 复制代码
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]
books.yearOfPublication.unique()
复制代码
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       '2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
      dtype=object)
python 复制代码
print(books.loc[books.yearOfPublication == 'DK Publishing Inc',:])
复制代码
              ISBN                                          bookTitle  \
209538  078946697X  DK Readers: Creating the X-Men, How It All Beg...   
221678  0789466953  DK Readers: Creating the X-Men, How Comic Book...   

       bookAuthor  yearOfPublication  \
209538       2000  DK Publishing Inc   
221678       2000  DK Publishing Inc   

                                                publisher  
209538  http://images.amazon.com/images/P/078946697X.0...  
221678  http://images.amazon.com/images/P/0789466953.0...  
python 复制代码
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 209538 | 078946697X | DK Readers: Creating the X-Men, How It All Beg... | 2000 | DK Publishing Inc | http://images.amazon.com/images/P/078946697X.0... |

221678 0789466953 DK Readers: Creating the X-Men, How Comic Book... 2000 DK Publishing Inc http://images.amazon.com/images/P/0789466953.0...
python 复制代码
## 从上面可以看出,bookAuthor错误地装载了bookTitle,因此需要进行修正。
# ISBN '0789466953'
books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000
books.loc[books.ISBN == '0789466953','bookAuthor'] = "James Buckley"
books.loc[books.ISBN == '0789466953','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '0789466953','bookTitle'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"

#ISBN '078946697X'
books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000
books.loc[books.ISBN == '078946697X','bookAuthor'] = "Michael Teitelbaum"
books.loc[books.ISBN == '078946697X','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '078946697X','bookTitle'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"
python 复制代码
books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 209538 | 078946697X | DK Readers: Creating the X-Men, How It All Beg... | Michael Teitelbaum | 2000 | DK Publishing Inc |

221678 0789466953 DK Readers: Creating the X-Men, How Comic Book... James Buckley 2000 DK Publishing Inc
python 复制代码
## 继续纠正出版年鉴的类型
books.yearOfPublication=pd.to_numeric(books.yearOfPublication, errors='coerce')
sorted(books['yearOfPublication'].unique())
复制代码
[0.0,
 1376.0,
 1378.0,
 1806.0,
 1897.0,
 1900.0,
 1901.0,
 1902.0,
 1904.0,
 1906.0,
 1908.0,
 1909.0,
 1910.0,
 1911.0,
 1914.0,
 1917.0,
 1919.0,
 1920.0,
 1921.0,
 1922.0,
 1923.0,
 1924.0,
 1925.0,
 1926.0,
 1927.0,
 1928.0,
 1929.0,
 1930.0,
 1931.0,
 1932.0,
 1933.0,
 1934.0,
 1935.0,
 1936.0,
 1937.0,
 1938.0,
 1939.0,
 1940.0,
 1941.0,
 1942.0,
 1943.0,
 1944.0,
 1945.0,
 1946.0,
 1947.0,
 1948.0,
 1949.0,
 1950.0,
 1951.0,
 1952.0,
 1953.0,
 1954.0,
 1955.0,
 1956.0,
 1957.0,
 1958.0,
 1959.0,
 1960.0,
 1961.0,
 1962.0,
 1963.0,
 1964.0,
 1965.0,
 1966.0,
 1967.0,
 1968.0,
 1969.0,
 1970.0,
 1971.0,
 1972.0,
 1973.0,
 1974.0,
 1975.0,
 1976.0,
 1977.0,
 1978.0,
 1979.0,
 1980.0,
 1981.0,
 1982.0,
 1983.0,
 1984.0,
 1985.0,
 1986.0,
 1987.0,
 1988.0,
 1989.0,
 1990.0,
 1991.0,
 1992.0,
 1993.0,
 1994.0,
 1995.0,
 1996.0,
 1997.0,
 1998.0,
 1999.0,
 2000.0,
 2001.0,
 2002.0,
 2003.0,
 2004.0,
 2005.0,
 2006.0,
 2008.0,
 2010.0,
 2011.0,
 2012.0,
 2020.0,
 2021.0,
 2024.0,
 2026.0,
 2030.0,
 2037.0,
 2038.0,
 2050.0,
 nan]
python 复制代码
## 现在可以看出yearOfPublication的类型为int,其值范围为0-2050。

## 由于该数据集建于2004年,我假设2006年之后的所有年份都无效,保留两年的保证金,以防数据集可能已更新。

## 对于所有无效条目(包括0),我将这些条目转换为NaN,然后​​用剩余年份的平均值替换它们。
books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.NAN
python 复制代码
# 用年出版的平均价值代替NaNs在案例数据集被更新的情况下保留一定的空白
books.yearOfPublication.fillna(round(books.yearOfPublication.mean()), inplace=True)
books.yearOfPublication.isnull().sum()
复制代码
0
python 复制代码
books.yearOfPublication = books.yearOfPublication.astype(np.int32)
python 复制代码
## publisher
books.loc[books.publisher.isnull(),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 128890 | 193169656X | Tyrant Moon | Elaine Corvidae | 2002 | NaN |

129037 1931696993 Finders Keepers Linnea Sinclair 2001 NaN
python 复制代码
## 检查行是否有书签作为查找器,看看我们是否能得到任何线索

## 与不同的出版商和图书作者的所有行
books.loc[(books.bookTitle == 'Tyrant Moon'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |

128890 193169656X Tyrant Moon Elaine Corvidae 2002 NaN
python 复制代码
books.loc[(books.bookTitle == 'Finders Keepers'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 10799 | 082177364X | Finders Keepers | Fern Michaels | 2002 | Zebra Books |
| 42019 | 0070465037 | Finders Keepers | Barbara Nickolae | 1989 | McGraw-Hill Companies |
| 58264 | 0688118461 | Finders Keepers | Emily Rodda | 1993 | Harpercollins Juvenile Books |
| 66678 | 1575663236 | Finders Keepers | Fern Michaels | 1998 | Kensington Publishing Corporation |
| 129037 | 1931696993 | Finders Keepers | Linnea Sinclair | 2001 | NaN |
| 134309 | 0156309505 | Finders Keepers | Will | 1989 | Voyager Books |
| 173473 | 0973146907 | Finders Keepers | Sean M. Costello | 2002 | Red Tower Publications |
| 195885 | 0061083909 | Finders Keepers | Sharon Sala | 2003 | HarperTorch |

211874 0373261160 Finders Keepers Elizabeth Travis 1993 Worldwide Library
python 复制代码
## 由图书作者检查以找到模式

## 都有不同的出版商。这里没有线索
books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 126762 | 1931696934 | Winter's Orphans | Elaine Corvidae | 2001 | Novelbooks |
| 128890 | 193169656X | Tyrant Moon | Elaine Corvidae | 2002 | NaN |

129001 0759901880 Wolfkin Elaine Corvidae 2001 Hard Shell Word Factory
python 复制代码
## 由图书作者检查以找到模式
books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |

129037 1931696993 Finders Keepers Linnea Sinclair 2001 NaN
python 复制代码
## 因为没有什么共同的东西可以推断出NaNs的发布者,将它们替换为"other"
books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'
books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'
python 复制代码
## 二、用户数据集
print (users.shape)
users.head()
复制代码
(278858, 3)

| | userID | Location | Age |
| 0 | 1 | nyc, new york, usa | NaN |
| 1 | 2 | stockton, california, usa | 18.0 |
| 2 | 3 | moscow, yukon territory, russia | NaN |
| 3 | 4 | porto, v.n.gaia, portugal | 17.0 |

4 5 farnborough, hants, united kingdom NaN
python 复制代码
users.dtypes
复制代码
userID        int64
Location     object
Age         float64
dtype: object
python 复制代码
users.userID.values
复制代码
array([     1,      2,      3, ..., 278856, 278857, 278858], dtype=int64)
python 复制代码
## Age 
sorted(users.Age.unique())
复制代码
[nan,
 0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 113.0,
 114.0,
 115.0,
 116.0,
 118.0,
 119.0,
 123.0,
 124.0,
 127.0,
 128.0,
 132.0,
 133.0,
 136.0,
 137.0,
 138.0,
 140.0,
 141.0,
 143.0,
 146.0,
 147.0,
 148.0,
 151.0,
 152.0,
 156.0,
 157.0,
 159.0,
 162.0,
 168.0,
 172.0,
 175.0,
 183.0,
 186.0,
 189.0,
 199.0,
 200.0,
 201.0,
 204.0,
 207.0,
 208.0,
 209.0,
 210.0,
 212.0,
 219.0,
 220.0,
 223.0,
 226.0,
 228.0,
 229.0,
 230.0,
 231.0,
 237.0,
 239.0,
 244.0]
python 复制代码
## 年龄栏有一些无效的条目,比如nan,0和非常高的值,比如100和以上
users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan
python 复制代码
## 用平均值代替NaN
## 将数据类型设置为int
users.Age = users.Age.fillna(users.Age.mean())
users.Age = users.Age.astype(np.int32)
python 复制代码
sorted(users.Age.unique())
复制代码
[5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90]
python 复制代码
## 三、评级数据集
ratings.shape
复制代码
(1149780, 3)
python 复制代码
## 如果每个用户对每个条目进行评级,那么评级数据集将有nusers * nbooks条目,这表明数据集非常稀疏。
n_users = users.shape[0]
n_books = books.shape[0]
print (n_users * n_books)
复制代码
75670906880
python 复制代码
ratings.head(5)

| | userID | ISBN | bookRating |
| 0 | 276725 | 034545104X | 0 |
| 1 | 276726 | 0155061224 | 5 |
| 2 | 276727 | 0446520802 | 0 |
| 3 | 276729 | 052165615X | 3 |

4 276729 0521795028 6
python 复制代码
ratings.bookRating.unique()
复制代码
array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)
python 复制代码
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]
python 复制代码
print (ratings.shape)
print (ratings_new.shape)
复制代码
(1149780, 3)
(1031136, 3)
python 复制代码
## 没有新用户添加,因此我们将使用高于数据集的新用户(1031136,3)
print ("number of users: " + str(n_users))
print ("number of books: " + str(n_books))
复制代码
number of users: 278858
number of books: 271360
python 复制代码
sparsity=1.0-len(ratings_new)/float(n_users*n_books)
print ('图书交叉数据集的稀疏级别是 ' +  str(sparsity*100) + ' %')
复制代码
图书交叉数据集的稀疏级别是 99.99863734155898 %
python 复制代码
ratings.bookRating.unique()
复制代码
array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)
python 复制代码
ratings_explicit = ratings_new[ratings_new.bookRating != 0]
ratings_implicit = ratings_new[ratings_new.bookRating == 0]
python 复制代码
print (ratings_new.shape)
print( ratings_explicit.shape)
print (ratings_implicit.shape)
复制代码
(1031136, 3)
(383842, 3)
(647294, 3)
python 复制代码
## 统计
sns.countplot(data=ratings_explicit , x='bookRating')
plt.show()
python 复制代码
## 基于简单流行度的推荐系统
ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['bookRating'].sum())
top10 = ratings_count.sort_values('bookRating', ascending = False).head(10)
print ("推荐下列书籍")
top10.merge(books, left_index = True, right_on = 'ISBN')
复制代码
推荐下列书籍

| | bookRating | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 408 | 5787 | 0316666343 | The Lovely Bones: A Novel | Alice Sebold | 2002 | Little, Brown |
| 748 | 4108 | 0385504209 | The Da Vinci Code | Dan Brown | 2003 | Doubleday |
| 522 | 3134 | 0312195516 | The Red Tent (Bestselling Backlist) | Anita Diamant | 1998 | Picador USA |
| 2143 | 2798 | 059035342X | Harry Potter and the Sorcerer's Stone (Harry P... | J. K. Rowling | 1999 | Arthur A. Levine Books |
| 356 | 2595 | 0142001740 | The Secret Life of Bees | Sue Monk Kidd | 2003 | Penguin Books |
| 26 | 2551 | 0971880107 | Wild Animus | Rich Shapero | 2004 | Too Far |
| 1105 | 2524 | 0060928336 | Divine Secrets of the Ya-Ya Sisterhood: A Novel | Rebecca Wells | 1997 | Perennial |
| 706 | 2402 | 0446672211 | Where the Heart Is (Oprah's Book Club (Paperba... | Billie Letts | 1998 | Warner Books |
| 231 | 2219 | 0452282152 | Girl with a Pearl Earring | Tracy Chevalier | 2001 | Plume Books |

118 2179 0671027360 Angels &amp; Demons Dan Brown 2001 Pocket Star
python 复制代码
users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]
users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]
python 复制代码
print (users.shape)
print (users_exp_ratings.shape)
print (users_imp_ratings.shape)
复制代码
(278858, 3)
(68091, 3)
(52451, 3)
python 复制代码
## 基于协同过滤的推荐系统
counts1 = ratings_explicit['userID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1 >= 100].index)]
counts = ratings_explicit['bookRating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(counts[counts >= 100].index)]
python 复制代码
ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix.head()
复制代码
(449, 66574)

| ISBN | 0000913154 | 0001046438 | 000104687X | 0001047213 | 0001047973 | 000104799X | 0001048082 | 0001053736 | 0001053744 | 0001055607 | ... | B000092Q0A | B00009EF82 | B00009NDAN | B0000DYXID | B0000T6KHI | B0000VZEJQ | B0000X8HIE | B00013AX9E | B0001I1KOG | B000234N3A |
| userID | | | | | | | | | | | | | | | | | | | | | |
| 2033 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2110 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2276 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |

4385 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 66574 columns

python 复制代码
n_users = ratings_matrix.shape[0] #只考虑那些给出明确评级的用户
n_books = ratings_matrix.shape[1]
print (n_users, n_books)
复制代码
449 66574
python 复制代码
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)
python 复制代码
ratings_matrix.head(5)

| ISBN | 0000913154 | 0001046438 | 000104687X | 0001047213 | 0001047973 | 000104799X | 0001048082 | 0001053736 | 0001053744 | 0001055607 | ... | B000092Q0A | B00009EF82 | B00009NDAN | B0000DYXID | B0000T6KHI | B0000VZEJQ | B0000X8HIE | B00013AX9E | B0001I1KOG | B000234N3A |
| userID | | | | | | | | | | | | | | | | | | | | | |
| 2033 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2276 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

4385 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 66574 columns

python 复制代码
sparsity=1.0-len(ratings_explicit)/float(users_exp_ratings.shape[0]*n_books)
print ('图书交叉数据集的稀疏级别是 ' +  str(sparsity*100) + ' %')
复制代码
图书交叉数据集的稀疏级别是 99.99772184106935 %
python 复制代码
## 基于用户的协同过滤
global metric,k
k=10
metric='cosine'
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
            
    return similarities,indices
python 复制代码
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    prediction=0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc,:].mean() #to adjust for zero based indexing
    sum_wt = np.sum(similarities)-1
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    #在非常稀疏的数据集的情况下,使用基于协作的方法的相关度量可能会给出负面的评价
    #在这里的处理如下
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    print ('用户预测等级 {0} -> item {1}: {2}'.format(user_id,item_id,prediction))
 
    return prediction
python 复制代码
## 测试
predict_userbased(11676,'0001056107',ratings_matrix)
复制代码
用户预测等级 11676 -> item 0001056107: 2





2
python 复制代码
## 基于项目的协同过滤
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    similarities=[]
    indices=[]
    ratings=ratings.T
    loc = ratings.index.get_loc(item_id)
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)
    
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
 
    return similarities,indices
python 复制代码
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1
    product=1
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == item_loc:
            continue;
        else:
            product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    
    #在非常稀疏的数据集的情况下,使用基于协作的方法的相关度量可能会给出负面的评价
    #在这里处理的是下面的//代码,没有下面的代码片段,下面的代码片段是为了避免负面影响
    #在使用相关度规时,可能会出现非常稀疏的数据集的预测
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
 
    print ('用户预测等级 {0} -> item {1}: {2}'.format(user_id,item_id,prediction)    )  
    
    return prediction
python 复制代码
## 测试
prediction = predict_itembased(11676,'0001056107',ratings_matrix)
复制代码
用户预测等级 11676 -> item 0001056107: 1
python 复制代码
相关推荐
我要学习别拦我~1 天前
读《精益数据分析》:用户行为热力图
经验分享·数据分析
smilejingwei1 天前
数据分析编程第六步:大数据运算
java·大数据·开发语言·数据分析·编程·esprocspl
luofeiju1 天前
直线拟合方法全景解析:最小二乘、正交回归与 RANSAC
人工智能·线性代数·算法·机器学习·数据挖掘·回归
喂完待续2 天前
【Big Data】AI赋能的ClickHouse 2.0:从JIT编译到LLM查询优化,下一代OLAP引擎进化路径
大数据·数据库·clickhouse·数据分析·olap·big data·序列晋升
c_hn_0072 天前
可转换公司债Level-2高频交易五档Tick级分钟历史数据分析指南
数据挖掘·数据分析·#etf五档行情快照·#大宗商品期货行情·#港股十档订单薄历史数据·#深度订单簿数据
大神薯条老师2 天前
Python从入门到高手9.4节-基于字典树的敏感词识别算法
爬虫·python·深度学习·机器学习·数据分析
淦暴尼3 天前
每日五个pyecharts可视化图表日历图和箱线图:从入门到精通
信息可视化·数据挖掘·数据分析
程序猿小D3 天前
【完整源码+数据集+部署教程】硬币分类与识别系统源码和数据集:改进yolo11-SWC
人工智能·yolo·计算机视觉·数据挖掘·数据集·yolo11·硬币分类与识别系统
武汉格发Gofartlic3 天前
HFSS许可证状态查询与管理
大数据·运维·人工智能·数据分析·自动化
大翻哥哥4 天前
Python地理空间数据分析:从地图绘制到智能城市应用
开发语言·python·数据分析