Data Analysis Assignment 4: Recommending Content Based on User and Item Data

```python
## Import supporting libraries
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import os, sys
import re
import seaborn as sns
```

```python
## Load the datasets and check the shapes of the books, users and ratings data
books = pd.read_csv('F:\\data\\bleeding_data\\BX-Books.csv',
                    sep=None,encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor',
                 'yearOfPublication', 'publisher',
                 'imageUrlS', 'imageUrlM', 'imageUrlL']

users = pd.read_csv('F:\\data\\bleeding_data\\BX-Users.csv',
                    sep=None, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']


ratings = pd.read_csv('F:\\data\\bleeding_data\\BX-Book-Ratings.csv',
                      sep=None, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

print (books.shape)
print (users.shape)
print (ratings.shape)
```

```
(271360, 8)
(278858, 3)
(1149780, 3)
```

```python
## Part 1: The books dataset
books.head()

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher | imageUrlS | imageUrlM | imageUrlL |
|---|---|---|---|---|---|---|---|---|
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford | 2002 | Oxford University Press | http://images.amazon.com/images/P/0195153448.0... | http://images.amazon.com/images/P/0195153448.0... | http://images.amazon.com/images/P/0195153448.0... |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright | 2001 | HarperFlamingo Canada | http://images.amazon.com/images/P/0002005018.0... | http://images.amazon.com/images/P/0002005018.0... | http://images.amazon.com/images/P/0002005018.0... |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este | 1991 | HarperPerennial | http://images.amazon.com/images/P/0060973129.0... | http://images.amazon.com/images/P/0060973129.0... | http://images.amazon.com/images/P/0060973129.0... |
| 3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata | 1999 | Farrar Straus Giroux | http://images.amazon.com/images/P/0374157065.0... | http://images.amazon.com/images/P/0374157065.0... | http://images.amazon.com/images/P/0374157065.0... |
| 4 | 0393045218 | The Mummies of Urumchi | E. J. W. Barber | 1999 | W. W. Norton & Company | http://images.amazon.com/images/P/0393045218.0... | http://images.amazon.com/images/P/0393045218.0... | http://images.amazon.com/images/P/0393045218.0... |
```python
## The image URL columns are not needed for the analysis, so drop them
books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)
books.head()

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford | 2002 | Oxford University Press |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright | 2001 | HarperFlamingo Canada |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este | 1991 | HarperPerennial |
| 3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata | 1999 | Farrar Straus Giroux |
| 4 | 0393045218 | The Mummies of Urumchi | E. J. W. Barber | 1999 | W. W. Norton & Company |
```python
## Check the column data types
books.dtypes
```

```
ISBN                 object
bookTitle            object
bookAuthor           object
yearOfPublication    object
publisher            object
dtype: object
```

```python
## Now check the unique values of the attributes
books.bookTitle.unique()
```

```
array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
       'Lily Dale : The True Story of the Town that Talks to the Dead',
       "Republic (World's Classics)",
       "A Guided Tour of Rene Descartes' Meditations on First Philosophy with Complete Translations of the Meditations by Ronald Rubin"],
      dtype=object)
```

```python
books.yearOfPublication.unique()
```

```
array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       '2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
      dtype=object)
```

```python
print(books.loc[books.yearOfPublication == 'DK Publishing Inc',:])
```

```
              ISBN                                          bookTitle  \
209538  078946697X  DK Readers: Creating the X-Men, How It All Beg...   
221678  0789466953  DK Readers: Creating the X-Men, How Comic Book...   

       bookAuthor  yearOfPublication  \
209538       2000  DK Publishing Inc   
221678       2000  DK Publishing Inc   

                                                publisher  
209538  http://images.amazon.com/images/P/078946697X.0...  
221678  http://images.amazon.com/images/P/0789466953.0...  
```

```python
books.loc[books.yearOfPublication == 'DK Publishing Inc',:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 209538 | 078946697X | DK Readers: Creating the X-Men, How It All Beg... | 2000 | DK Publishing Inc | http://images.amazon.com/images/P/078946697X.0... |
| 221678 | 0789466953 | DK Readers: Creating the X-Men, How Comic Book... | 2000 | DK Publishing Inc | http://images.amazon.com/images/P/0789466953.0... |
```python
## As seen above, these two rows are mis-parsed: the year ended up in bookAuthor and the publisher in yearOfPublication, so they need to be corrected manually.
# ISBN '0789466953'
books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000
books.loc[books.ISBN == '0789466953','bookAuthor'] = "James Buckley"
books.loc[books.ISBN == '0789466953','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '0789466953','bookTitle'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"

#ISBN '078946697X'
books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000
books.loc[books.ISBN == '078946697X','bookAuthor'] = "Michael Teitelbaum"
books.loc[books.ISBN == '078946697X','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '078946697X','bookTitle'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"
```

```python
books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'),:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 209538 | 078946697X | DK Readers: Creating the X-Men, How It All Beg... | Michael Teitelbaum | 2000 | DK Publishing Inc |
| 221678 | 0789466953 | DK Readers: Creating the X-Men, How Comic Book... | James Buckley | 2000 | DK Publishing Inc |
```python
## Next, correct the data type of yearOfPublication
books.yearOfPublication=pd.to_numeric(books.yearOfPublication, errors='coerce')
sorted(books['yearOfPublication'].unique())
```

```
[0.0, 1376.0, 1378.0, 1806.0, 1897.0, 1900.0, 1901.0, 1902.0, 1904.0, 1906.0,
 1908.0, 1909.0, 1910.0, 1911.0, 1914.0, 1917.0, 1919.0, 1920.0, 1921.0, 1922.0,
 1923.0, 1924.0, 1925.0, 1926.0, 1927.0, 1928.0, 1929.0, 1930.0, 1931.0, 1932.0,
 1933.0, 1934.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0, 1942.0,
 1943.0, 1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0,
 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0,
 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0,
 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0,
 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0,
 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0,
 2003.0, 2004.0, 2005.0, 2006.0, 2008.0, 2010.0, 2011.0, 2012.0, 2020.0, 2021.0,
 2024.0, 2026.0, 2030.0, 2037.0, 2038.0, 2050.0, nan]
```

```python
## yearOfPublication is now numeric, but its values range from 0 up to 2050.

## Since this dataset was compiled in 2004, I assume all years after 2006 are invalid, keeping a two-year margin in case the dataset was updated later.

## All invalid entries (including 0) are converted to NaN and then replaced with the mean of the remaining years.
books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.nan
```

```python
# Replace the NaNs with the mean publication year
books.yearOfPublication.fillna(round(books.yearOfPublication.mean()), inplace=True)
books.yearOfPublication.isnull().sum()
```

```
0
```

```python
books.yearOfPublication = books.yearOfPublication.astype(np.int32)
```

```python
## publisher
books.loc[books.publisher.isnull(),:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 128890 | 193169656X | Tyrant Moon | Elaine Corvidae | 2002 | NaN |
| 129037 | 1931696993 | Finders Keepers | Linnea Sinclair | 2001 | NaN |
```python
## Check the rows that share these book titles to see whether they give any clue
## about the missing publishers -- the matching rows all have different publishers and authors
books.loc[(books.bookTitle == 'Tyrant Moon'),:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 128890 | 193169656X | Tyrant Moon | Elaine Corvidae | 2002 | NaN |
```python
books.loc[(books.bookTitle == 'Finders Keepers'),:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 10799 | 082177364X | Finders Keepers | Fern Michaels | 2002 | Zebra Books |
| 42019 | 0070465037 | Finders Keepers | Barbara Nickolae | 1989 | McGraw-Hill Companies |
| 58264 | 0688118461 | Finders Keepers | Emily Rodda | 1993 | Harpercollins Juvenile Books |
| 66678 | 1575663236 | Finders Keepers | Fern Michaels | 1998 | Kensington Publishing Corporation |
| 129037 | 1931696993 | Finders Keepers | Linnea Sinclair | 2001 | NaN |
| 134309 | 0156309505 | Finders Keepers | Will | 1989 | Voyager Books |
| 173473 | 0973146907 | Finders Keepers | Sean M. Costello | 2002 | Red Tower Publications |
| 195885 | 0061083909 | Finders Keepers | Sharon Sala | 2003 | HarperTorch |
| 211874 | 0373261160 | Finders Keepers | Elizabeth Travis | 1993 | Worldwide Library |
```python
## Check by bookAuthor to look for a pattern

## -- the author's other books have different publishers, so no clue here either
books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 126762 | 1931696934 | Winter's Orphans | Elaine Corvidae | 2001 | Novelbooks |
| 128890 | 193169656X | Tyrant Moon | Elaine Corvidae | 2002 | NaN |
| 129001 | 0759901880 | Wolfkin | Elaine Corvidae | 2001 | Hard Shell Word Factory |
```python
## Check by bookAuthor to look for a pattern
books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]

```

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|
| 129037 | 1931696993 | Finders Keepers | Linnea Sinclair | 2001 | NaN |
```python
## Since there is nothing in common from which to infer the missing publishers, replace the NaNs with "other"
books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'
books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'
```

```python
## Part 2: The users dataset
print (users.shape)
users.head()
```

```
(278858, 3)

```

| | userID | Location | Age |
|---|---|---|---|
| 0 | 1 | nyc, new york, usa | NaN |
| 1 | 2 | stockton, california, usa | 18.0 |
| 2 | 3 | moscow, yukon territory, russia | NaN |
| 3 | 4 | porto, v.n.gaia, portugal | 17.0 |
| 4 | 5 | farnborough, hants, united kingdom | NaN |
```python
users.dtypes
```

```
userID        int64
Location     object
Age         float64
dtype: object
```

```python
users.userID.values
```

```
array([     1,      2,      3, ..., 278856, 278857, 278858], dtype=int64)
```

```python
## Age 
sorted(users.Age.unique())
```

```
[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0,
 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0,
 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0,
 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0,
 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0,
 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0,
 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0,
 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0,
 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0,
 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0,
 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0,
 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0,
 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0, 228.0,
 229.0, 230.0, 231.0, 237.0, 239.0, 244.0]
```

```python
## The Age column has some invalid entries, such as NaN, 0 and implausibly high values (100 and above)
users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan
```

```python
## Replace NaN with the mean age
## and set the data type to int
users.Age = users.Age.fillna(users.Age.mean())
users.Age = users.Age.astype(np.int32)
```

```python
sorted(users.Age.unique())
```

```
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
 85, 86, 87, 88, 89, 90]
```

```python
## Part 3: The ratings dataset
ratings.shape
```

```
(1149780, 3)
```

```python
## If every user rated every book, the ratings dataset would have n_users * n_books entries -- about 75.7 billion, versus roughly 1.15 million actual ratings -- so the dataset is extremely sparse.
n_users = users.shape[0]
n_books = books.shape[0]
print (n_users * n_books)
```

```
75670906880
```

```python
ratings.head(5)

```

| | userID | ISBN | bookRating |
|---|---|---|---|
| 0 | 276725 | 034545104X | 0 |
| 1 | 276726 | 0155061224 | 5 |
| 2 | 276727 | 0446520802 | 0 |
| 3 | 276729 | 052165615X | 3 |
| 4 | 276729 | 0521795028 | 6 |
```python
ratings.bookRating.unique()
```

```
array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)
```

```python
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]
```

```python
print (ratings.shape)
print (ratings_new.shape)
```

```
(1149780, 3)
(1031136, 3)
```

```python
## No new users were added by this filtering step, so from here on we work with the filtered ratings dataset of shape (1031136, 3)
print ("number of users: " + str(n_users))
print ("number of books: " + str(n_books))
```

```
number of users: 278858
number of books: 271360
```

```python
sparsity=1.0-len(ratings_new)/float(n_users*n_books)
print ('The sparsity level of the Book-Crossing dataset is ' +  str(sparsity*100) + ' %')
```

```
The sparsity level of the Book-Crossing dataset is 99.99863734155898 %
```

```python
ratings_explicit = ratings_new[ratings_new.bookRating != 0]
ratings_implicit = ratings_new[ratings_new.bookRating == 0]
```

```python
print (ratings_new.shape)
print( ratings_explicit.shape)
print (ratings_implicit.shape)
```

```
(1031136, 3)
(383842, 3)
(647294, 3)
```

```python
## Plot the distribution of the explicit ratings
sns.countplot(data=ratings_explicit , x='bookRating')
plt.show()
```

```python
## A simple popularity-based recommender
ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['bookRating'].sum())
top10 = ratings_count.sort_values('bookRating', ascending = False).head(10)
print ("推荐下列书籍")
top10.merge(books, left_index = True, right_on = 'ISBN')
```

```
The following books are recommended
```

| | bookRating | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
|---|---|---|---|---|---|---|
| 408 | 5787 | 0316666343 | The Lovely Bones: A Novel | Alice Sebold | 2002 | Little, Brown |
| 748 | 4108 | 0385504209 | The Da Vinci Code | Dan Brown | 2003 | Doubleday |
| 522 | 3134 | 0312195516 | The Red Tent (Bestselling Backlist) | Anita Diamant | 1998 | Picador USA |
| 2143 | 2798 | 059035342X | Harry Potter and the Sorcerer's Stone (Harry P... | J. K. Rowling | 1999 | Arthur A. Levine Books |
| 356 | 2595 | 0142001740 | The Secret Life of Bees | Sue Monk Kidd | 2003 | Penguin Books |
| 26 | 2551 | 0971880107 | Wild Animus | Rich Shapero | 2004 | Too Far |
| 1105 | 2524 | 0060928336 | Divine Secrets of the Ya-Ya Sisterhood: A Novel | Rebecca Wells | 1997 | Perennial |
| 706 | 2402 | 0446672211 | Where the Heart Is (Oprah's Book Club (Paperba... | Billie Letts | 1998 | Warner Books |
| 231 | 2219 | 0452282152 | Girl with a Pearl Earring | Tracy Chevalier | 2001 | Plume Books |
| 118 | 2179 | 0671027360 | Angels &amp; Demons | Dan Brown | 2001 | Pocket Star |
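
The ranking above scores a book by the sum of its explicit ratings, so it rewards books that are rated often at least as much as books that are rated highly. As a point of comparison, here is a minimal, illustrative sketch (using only the `ratings_explicit` and `books` frames defined above; the variable name `popularity` exists only in this example) that ranks by the number of ratings and keeps the mean rating for context:

```python
# Illustrative alternative: rank by how many explicit ratings a book received,
# reporting the mean rating alongside the count.
popularity = (ratings_explicit.groupby('ISBN')['bookRating']
              .agg(['count', 'mean'])
              .sort_values('count', ascending=False)
              .head(10))
popularity.merge(books, left_index=True, right_on='ISBN')[['ISBN', 'bookTitle', 'count', 'mean']]
```

Either way the ranking is not personalised, which is what the collaborative-filtering models below address.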
```python
users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]
users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]
```

```python
print (users.shape)
print (users_exp_ratings.shape)
print (users_imp_ratings.shape)
```

```
(278858, 3)
(68091, 3)
(52451, 3)
```

```python
## A recommender based on collaborative filtering
# keep only users with at least 100 explicit ratings, so the rating matrix stays manageable
counts1 = ratings_explicit['userID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1 >= 100].index)]
# keep only rating values that occur at least 100 times
counts = ratings_explicit['bookRating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(counts[counts >= 100].index)]
```

```python
ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix.head()
```

```
(449, 66574)

```

| ISBN | 0000913154 | 0001046438 | 000104687X | 0001047213 | 0001047973 | 000104799X | 0001048082 | 0001053736 | 0001053744 | 0001055607 | ... | B000092Q0A | B00009EF82 | B00009NDAN | B0000DYXID | B0000T6KHI | B0000VZEJQ | B0000X8HIE | B00013AX9E | B0001I1KOG | B000234N3A |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| userID | | | | | | | | | | | | | | | | | | | | | |
| 2033 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2110 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2276 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4385 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |

5 rows × 66574 columns
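
The pivoted matrix is dense: 449 × 66,574 cells, nearly all of them missing. The walkthrough below simply fills the NaNs with 0, but if memory became a concern the same data could be held in sparse form. A minimal sketch, assuming the `ratings_matrix` built above (the variable `sparse_ratings` is only for this example):

```python
# Optional, illustrative only: store the user-item matrix as a SciPy CSR matrix.
# The rest of this walkthrough keeps using the dense DataFrame.
from scipy.sparse import csr_matrix

sparse_ratings = csr_matrix(ratings_matrix.fillna(0).values)
print(sparse_ratings.shape, sparse_ratings.nnz, 'stored ratings')
```

NearestNeighbors with metric='cosine' and algorithm='brute', as used below, can also be fitted directly on such a sparse matrix.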

```python
n_users = ratings_matrix.shape[0] # consider only the users who gave explicit ratings
n_books = ratings_matrix.shape[1]
print (n_users, n_books)
```

```
449 66574
```

```python
ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)
```

```python
ratings_matrix.head(5)

```

| ISBN | 0000913154 | 0001046438 | 000104687X | 0001047213 | 0001047973 | 000104799X | 0001048082 | 0001053736 | 0001053744 | 0001055607 | ... | B000092Q0A | B00009EF82 | B00009NDAN | B0000DYXID | B0000T6KHI | B0000VZEJQ | B0000X8HIE | B00013AX9E | B0001I1KOG | B000234N3A |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| userID | | | | | | | | | | | | | | | | | | | | | |
| 2033 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2276 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4385 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 66574 columns

```python
sparsity=1.0-len(ratings_explicit)/float(users_exp_ratings.shape[0]*n_books)
print ('The sparsity level of the Book-Crossing dataset is ' +  str(sparsity*100) + ' %')
```

```
The sparsity level of the Book-Crossing dataset is 99.99772184106935 %
```

```python
## User-based collaborative filtering
global metric,k
k=10
metric='cosine'
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
            
    return similarities,indices
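
# Quick illustration (sketch): inspect one user's neighbours. User 2033 is simply
# the first userID visible in ratings_matrix.head() above; the returned indices are
# positions into ratings_matrix.index, and the nearest "neighbour" is the query
# user itself with similarity 1.0.
sims, idx = findksimilarusers(2033, ratings_matrix)
print(list(zip(ratings_matrix.index[idx.flatten()], np.round(sims, 3))))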
```

```python
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    prediction = 0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, k) # similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc,:].mean() # the active user's mean rating
    sum_wt = np.sum(similarities)-1 # exclude the user's own similarity of 1.0
    wtd_sum = 0

    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue # skip the active user
        else:
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product

    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))

    # With a very sparse dataset the mean-centred weighted sum can fall outside
    # the valid rating range, so clamp the prediction to [1, 10]
    if prediction <= 0:
        prediction = 1
    elif prediction > 10:
        prediction = 10

    print ('Predicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction
```

```python
## Test
predict_userbased(11676,'0001056107',ratings_matrix)
```

```
Predicted rating for user 11676 -> item 0001056107: 2

2
```

```python
## Item-based collaborative filtering
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    similarities=[]
    indices=[]
    ratings=ratings.T
    loc = ratings.index.get_loc(item_id)
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)
    
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
 
    return similarities,indices
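
# Quick illustration (sketch): the nearest items to ISBN '0001056107' (the item used
# in the test cells), mapped back to their titles via the books table. The first
# neighbour returned is the query item itself.
sims, idx = findksimilaritems('0001056107', ratings_matrix)
similar_isbns = ratings_matrix.columns[idx.flatten()]
print(books[books.ISBN.isin(similar_isbns)][['ISBN', 'bookTitle']])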
```

```python
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction = wtd_sum = 0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilaritems(item_id, ratings) # similar items based on cosine similarity
    sum_wt = np.sum(similarities)-1 # exclude the item's own similarity of 1.0
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == item_loc:
            continue # skip the item itself
        else:
            product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product
    prediction = int(round(wtd_sum/sum_wt))

    # With a very sparse dataset the weighted sum can fall outside the valid
    # rating range, so clamp the prediction to [1, 10]
    if prediction <= 0:
        prediction = 1
    elif prediction > 10:
        prediction = 10

    print ('Predicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction
```

```python
## Test
prediction = predict_itembased(11676,'0001056107',ratings_matrix)
```

```
Predicted rating for user 11676 -> item 0001056107: 1
```
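
To close the loop, here is a minimal sketch of how the user-based predictor could be turned into top-N recommendations. The function name, the candidate cap and the example user (11676, the user from the test cells above) are illustrative only; scoring every unrated book this way would be slow, because each call to predict_userbased refits the k-NN model and prints its prediction.

```python
## Top-N recommendation sketch built on predict_userbased (illustrative only)
def recommend_userbased(user_id, ratings, n=5, max_candidates=200):
    user_row = ratings.loc[user_id]
    # candidate items: books with no explicit rating from this user (stored as 0);
    # only a slice of them is scored to keep the example cheap
    candidates = user_row[user_row == 0].index[:max_candidates]
    scores = {isbn: predict_userbased(user_id, isbn, ratings) for isbn in candidates}
    top = sorted(scores, key=scores.get, reverse=True)[:n]
    return books[books.ISBN.isin(top)][['ISBN', 'bookTitle', 'bookAuthor']]

# recommend_userbased(11676, ratings_matrix)
```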