数据分析作业四-基于用户及物品数据进行内容推荐

python 复制代码

## 导入支持库
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os, sys
import re
import seaborn as sns

python 复制代码

## 加载数据集并检查书籍，用户和评级数据集的形状
books = pd.read_csv('F:\\data\\bleeding_data\\BX-Books.csv',
                    sep=None,encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor',
                 'yearOfPublication', 'publisher',
                 'imageUrlS', 'imageUrlM', 'imageUrlL']

users = pd.read_csv('F:\\data\\bleeding_data\\BX-Users.csv',
                    sep=None, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']


ratings = pd.read_csv('F:\\data\\bleeding_data\\BX-Book-Ratings.csv',
                      sep=None, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

print (books.shape)
print (users.shape)
print (ratings.shape)

复制代码

(271360, 8)
(278858, 3)
(1149780, 3)

python 复制代码

## 一、图书数据集
books.head()

4	0393045218	The Mummies of Urumchi	E. J. W. Barber	1999	W. W. Norton & Company	http://images.amazon.com/images/P/0393045218.0...	http://images.amazon.com/images/P/0393045218.0...	http://images.amazon.com/images/P/0393045218.0...

python 复制代码

## url不需要分析，进行删除
books.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'],axis=1,inplace=True)
books.head()

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford | 2002 | Oxford University Press |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright | 2001 | HarperFlamingo Canada |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este | 1991 | HarperPerennial |
| 3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata | 1999 | Farrar Straus Giroux |

4	0393045218	The Mummies of Urumchi	E. J. W. Barber	1999	W. W. Norton & Company

python 复制代码

## books.dtypes
books.dtypes

复制代码

ISBN                 object
bookTitle            object
bookAuthor           object
yearOfPublication    object
publisher            object
dtype: object

python 复制代码

## 现在检查属性的唯一值
books.bookTitle.unique()

复制代码

array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
       'Lily Dale : The True Story of the Town that Talks to the Dead',
       "Republic (World's Classics)",
       "A Guided Tour of Rene Descartes' Meditations on First Philosophy with Complete Translations of the Meditations by Ronald Rubin"],
      dtype=object)

python 复制代码

books.yearOfPublication.unique()

复制代码

array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       '2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
      dtype=object)

python 复制代码

books.loc[books.yearOfPublication == 'DK Publishing Inc',:]
books.yearOfPublication.unique()

复制代码

array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       '2008', '1378', '1919', '1922', '1897', '2024', '1376', '2037'],
      dtype=object)

python 复制代码

print(books.loc[books.yearOfPublication == 'DK Publishing Inc',:])

复制代码

              ISBN                                          bookTitle  \
209538  078946697X  DK Readers: Creating the X-Men, How It All Beg...   
221678  0789466953  DK Readers: Creating the X-Men, How Comic Book...   

       bookAuthor  yearOfPublication  \
209538       2000  DK Publishing Inc   
221678       2000  DK Publishing Inc   

                                                publisher  
209538  http://images.amazon.com/images/P/078946697X.0...  
221678  http://images.amazon.com/images/P/0789466953.0...

python 复制代码

books.loc[books.yearOfPublication == 'DK Publishing Inc',:]

221678	0789466953	DK Readers: Creating the X-Men, How Comic Book...	2000	DK Publishing Inc	http://images.amazon.com/images/P/0789466953.0...

python 复制代码

## 从上面可以看出，bookAuthor错误地装载了bookTitle，因此需要进行修正。
# ISBN '0789466953'
books.loc[books.ISBN == '0789466953','yearOfPublication'] = 2000
books.loc[books.ISBN == '0789466953','bookAuthor'] = "James Buckley"
books.loc[books.ISBN == '0789466953','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '0789466953','bookTitle'] = "DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)"

#ISBN '078946697X'
books.loc[books.ISBN == '078946697X','yearOfPublication'] = 2000
books.loc[books.ISBN == '078946697X','bookAuthor'] = "Michael Teitelbaum"
books.loc[books.ISBN == '078946697X','publisher'] = "DK Publishing Inc"
books.loc[books.ISBN == '078946697X','bookTitle'] = "DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)"

python 复制代码

books.loc[(books.ISBN == '0789466953') | (books.ISBN == '078946697X'),:]

221678	0789466953	DK Readers: Creating the X-Men, How Comic Book...	James Buckley	2000	DK Publishing Inc

python 复制代码

## 继续纠正出版年鉴的类型
books.yearOfPublication=pd.to_numeric(books.yearOfPublication, errors='coerce')
sorted(books['yearOfPublication'].unique())

复制代码

[0.0,
 1376.0,
 1378.0,
 1806.0,
 1897.0,
 1900.0,
 1901.0,
 1902.0,
 1904.0,
 1906.0,
 1908.0,
 1909.0,
 1910.0,
 1911.0,
 1914.0,
 1917.0,
 1919.0,
 1920.0,
 1921.0,
 1922.0,
 1923.0,
 1924.0,
 1925.0,
 1926.0,
 1927.0,
 1928.0,
 1929.0,
 1930.0,
 1931.0,
 1932.0,
 1933.0,
 1934.0,
 1935.0,
 1936.0,
 1937.0,
 1938.0,
 1939.0,
 1940.0,
 1941.0,
 1942.0,
 1943.0,
 1944.0,
 1945.0,
 1946.0,
 1947.0,
 1948.0,
 1949.0,
 1950.0,
 1951.0,
 1952.0,
 1953.0,
 1954.0,
 1955.0,
 1956.0,
 1957.0,
 1958.0,
 1959.0,
 1960.0,
 1961.0,
 1962.0,
 1963.0,
 1964.0,
 1965.0,
 1966.0,
 1967.0,
 1968.0,
 1969.0,
 1970.0,
 1971.0,
 1972.0,
 1973.0,
 1974.0,
 1975.0,
 1976.0,
 1977.0,
 1978.0,
 1979.0,
 1980.0,
 1981.0,
 1982.0,
 1983.0,
 1984.0,
 1985.0,
 1986.0,
 1987.0,
 1988.0,
 1989.0,
 1990.0,
 1991.0,
 1992.0,
 1993.0,
 1994.0,
 1995.0,
 1996.0,
 1997.0,
 1998.0,
 1999.0,
 2000.0,
 2001.0,
 2002.0,
 2003.0,
 2004.0,
 2005.0,
 2006.0,
 2008.0,
 2010.0,
 2011.0,
 2012.0,
 2020.0,
 2021.0,
 2024.0,
 2026.0,
 2030.0,
 2037.0,
 2038.0,
 2050.0,
 nan]

python 复制代码

## 现在可以看出yearOfPublication的类型为int，其值范围为0-2050。

## 由于该数据集建于2004年，我假设2006年之后的所有年份都无效，保留两年的保证金，以防数据集可能已更新。

## 对于所有无效条目（包括0），我将这些条目转换为NaN，然后用剩余年份的平均值替换它们。
books.loc[(books.yearOfPublication > 2006) | (books.yearOfPublication == 0),'yearOfPublication'] = np.NAN

python 复制代码

# 用年出版的平均价值代替NaNs在案例数据集被更新的情况下保留一定的空白
books.yearOfPublication.fillna(round(books.yearOfPublication.mean()), inplace=True)
books.yearOfPublication.isnull().sum()

复制代码

python 复制代码

books.yearOfPublication = books.yearOfPublication.astype(np.int32)

python 复制代码

## publisher
books.loc[books.publisher.isnull(),:]

129037	1931696993	Finders Keepers	Linnea Sinclair	2001	NaN

python 复制代码

## 检查行是否有书签作为查找器，看看我们是否能得到任何线索

## 与不同的出版商和图书作者的所有行
books.loc[(books.bookTitle == 'Tyrant Moon'),:]

128890	193169656X	Tyrant Moon	Elaine Corvidae	2002	NaN

python 复制代码

books.loc[(books.bookTitle == 'Finders Keepers'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 10799 | 082177364X | Finders Keepers | Fern Michaels | 2002 | Zebra Books |
| 42019 | 0070465037 | Finders Keepers | Barbara Nickolae | 1989 | McGraw-Hill Companies |
| 58264 | 0688118461 | Finders Keepers | Emily Rodda | 1993 | Harpercollins Juvenile Books |
| 66678 | 1575663236 | Finders Keepers | Fern Michaels | 1998 | Kensington Publishing Corporation |
| 129037 | 1931696993 | Finders Keepers | Linnea Sinclair | 2001 | NaN |
| 134309 | 0156309505 | Finders Keepers | Will | 1989 | Voyager Books |
| 173473 | 0973146907 | Finders Keepers | Sean M. Costello | 2002 | Red Tower Publications |
| 195885 | 0061083909 | Finders Keepers | Sharon Sala | 2003 | HarperTorch |

211874	0373261160	Finders Keepers	Elizabeth Travis	1993	Worldwide Library

python 复制代码

## 由图书作者检查以找到模式

## 都有不同的出版商。这里没有线索
books.loc[(books.bookAuthor == 'Elaine Corvidae'),:]

| | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 126762 | 1931696934 | Winter's Orphans | Elaine Corvidae | 2001 | Novelbooks |
| 128890 | 193169656X | Tyrant Moon | Elaine Corvidae | 2002 | NaN |

129001	0759901880	Wolfkin	Elaine Corvidae	2001	Hard Shell Word Factory

python 复制代码

## 由图书作者检查以找到模式
books.loc[(books.bookAuthor == 'Linnea Sinclair'),:]

129037	1931696993	Finders Keepers	Linnea Sinclair	2001	NaN

python 复制代码

## 因为没有什么共同的东西可以推断出NaNs的发布者，将它们替换为"other"
books.loc[(books.ISBN == '193169656X'),'publisher'] = 'other'
books.loc[(books.ISBN == '1931696993'),'publisher'] = 'other'

python 复制代码

## 二、用户数据集
print (users.shape)
users.head()

复制代码

(278858, 3)

| | userID | Location | Age |
| 0 | 1 | nyc, new york, usa | NaN |
| 1 | 2 | stockton, california, usa | 18.0 |
| 2 | 3 | moscow, yukon territory, russia | NaN |
| 3 | 4 | porto, v.n.gaia, portugal | 17.0 |

4	5	farnborough, hants, united kingdom	NaN

python 复制代码

users.dtypes

复制代码

userID        int64
Location     object
Age         float64
dtype: object

python 复制代码

users.userID.values

复制代码

array([     1,      2,      3, ..., 278856, 278857, 278858], dtype=int64)

python 复制代码

## Age 
sorted(users.Age.unique())

复制代码

[nan,
 0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 113.0,
 114.0,
 115.0,
 116.0,
 118.0,
 119.0,
 123.0,
 124.0,
 127.0,
 128.0,
 132.0,
 133.0,
 136.0,
 137.0,
 138.0,
 140.0,
 141.0,
 143.0,
 146.0,
 147.0,
 148.0,
 151.0,
 152.0,
 156.0,
 157.0,
 159.0,
 162.0,
 168.0,
 172.0,
 175.0,
 183.0,
 186.0,
 189.0,
 199.0,
 200.0,
 201.0,
 204.0,
 207.0,
 208.0,
 209.0,
 210.0,
 212.0,
 219.0,
 220.0,
 223.0,
 226.0,
 228.0,
 229.0,
 230.0,
 231.0,
 237.0,
 239.0,
 244.0]

python 复制代码

## 年龄栏有一些无效的条目，比如nan，0和非常高的值，比如100和以上
users.loc[(users.Age > 90) | (users.Age < 5), 'Age'] = np.nan

python 复制代码

## 用平均值代替NaN
## 将数据类型设置为int
users.Age = users.Age.fillna(users.Age.mean())
users.Age = users.Age.astype(np.int32)

python 复制代码

sorted(users.Age.unique())

复制代码

[5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90]

python 复制代码

## 三、评级数据集
ratings.shape

复制代码

(1149780, 3)

python 复制代码

## 如果每个用户对每个条目进行评级，那么评级数据集将有nusers * nbooks条目，这表明数据集非常稀疏。
n_users = users.shape[0]
n_books = books.shape[0]
print (n_users * n_books)

复制代码

75670906880

python 复制代码

ratings.head(5)

| | userID | ISBN | bookRating |
| 0 | 276725 | 034545104X | 0 |
| 1 | 276726 | 0155061224 | 5 |
| 2 | 276727 | 0446520802 | 0 |
| 3 | 276729 | 052165615X | 3 |

4	276729	0521795028	6

python 复制代码

ratings.bookRating.unique()

复制代码

array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

python 复制代码

ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]

python 复制代码

print (ratings.shape)
print (ratings_new.shape)

复制代码

(1149780, 3)
(1031136, 3)

python 复制代码

## 没有新用户添加，因此我们将使用高于数据集的新用户（1031136，3）
print ("number of users: " + str(n_users))
print ("number of books: " + str(n_books))

复制代码

number of users: 278858
number of books: 271360

python 复制代码

sparsity=1.0-len(ratings_new)/float(n_users*n_books)
print ('图书交叉数据集的稀疏级别是 ' +  str(sparsity*100) + ' %')

复制代码

图书交叉数据集的稀疏级别是 99.99863734155898 %

python 复制代码

ratings.bookRating.unique()

复制代码

array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

python 复制代码

ratings_explicit = ratings_new[ratings_new.bookRating != 0]
ratings_implicit = ratings_new[ratings_new.bookRating == 0]

python 复制代码

print (ratings_new.shape)
print( ratings_explicit.shape)
print (ratings_implicit.shape)

复制代码

(1031136, 3)
(383842, 3)
(647294, 3)

python 复制代码

## 统计
sns.countplot(data=ratings_explicit , x='bookRating')
plt.show()

python 复制代码

## 基于简单流行度的推荐系统
ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['bookRating'].sum())
top10 = ratings_count.sort_values('bookRating', ascending = False).head(10)
print ("推荐下列书籍")
top10.merge(books, left_index = True, right_on = 'ISBN')

复制代码

推荐下列书籍

| | bookRating | ISBN | bookTitle | bookAuthor | yearOfPublication | publisher |
| 408 | 5787 | 0316666343 | The Lovely Bones: A Novel | Alice Sebold | 2002 | Little, Brown |
| 748 | 4108 | 0385504209 | The Da Vinci Code | Dan Brown | 2003 | Doubleday |
| 522 | 3134 | 0312195516 | The Red Tent (Bestselling Backlist) | Anita Diamant | 1998 | Picador USA |
| 2143 | 2798 | 059035342X | Harry Potter and the Sorcerer's Stone (Harry P... | J. K. Rowling | 1999 | Arthur A. Levine Books |
| 356 | 2595 | 0142001740 | The Secret Life of Bees | Sue Monk Kidd | 2003 | Penguin Books |
| 26 | 2551 | 0971880107 | Wild Animus | Rich Shapero | 2004 | Too Far |
| 1105 | 2524 | 0060928336 | Divine Secrets of the Ya-Ya Sisterhood: A Novel | Rebecca Wells | 1997 | Perennial |
| 706 | 2402 | 0446672211 | Where the Heart Is (Oprah's Book Club (Paperba... | Billie Letts | 1998 | Warner Books |
| 231 | 2219 | 0452282152 | Girl with a Pearl Earring | Tracy Chevalier | 2001 | Plume Books |

118	2179	0671027360	Angels & Demons	Dan Brown	2001	Pocket Star

python 复制代码

users_exp_ratings = users[users.userID.isin(ratings_explicit.userID)]
users_imp_ratings = users[users.userID.isin(ratings_implicit.userID)]

python 复制代码

print (users.shape)
print (users_exp_ratings.shape)
print (users_imp_ratings.shape)

复制代码

(278858, 3)
(68091, 3)
(52451, 3)

python 复制代码

## 基于协同过滤的推荐系统
counts1 = ratings_explicit['userID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['userID'].isin(counts1[counts1 >= 100].index)]
counts = ratings_explicit['bookRating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['bookRating'].isin(counts[counts >= 100].index)]

python 复制代码

ratings_matrix = ratings_explicit.pivot(index='userID', columns='ISBN', values='bookRating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix.head()

复制代码

(449, 66574)

| ISBN | 0000913154 | 0001046438 | 000104687X | 0001047213 | 0001047973 | 000104799X | 0001048082 | 0001053736 | 0001053744 | 0001055607 | ... | B000092Q0A | B00009EF82 | B00009NDAN | B0000DYXID | B0000T6KHI | B0000VZEJQ | B0000X8HIE | B00013AX9E | B0001I1KOG | B000234N3A |
| userID | | | | | | | | | | | | | | | | | | | | | |
| 2033 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2110 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2276 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |

4385	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 66574 columns

python 复制代码

n_users = ratings_matrix.shape[0] #只考虑那些给出明确评级的用户
n_books = ratings_matrix.shape[1]
print (n_users, n_books)

复制代码

449 66574

python 复制代码

ratings_matrix.fillna(0, inplace = True)
ratings_matrix = ratings_matrix.astype(np.int32)

python 复制代码

ratings_matrix.head(5)

| ISBN | 0000913154 | 0001046438 | 000104687X | 0001047213 | 0001047973 | 000104799X | 0001048082 | 0001053736 | 0001053744 | 0001055607 | ... | B000092Q0A | B00009EF82 | B00009NDAN | B0000DYXID | B0000T6KHI | B0000VZEJQ | B0000X8HIE | B00013AX9E | B0001I1KOG | B000234N3A |
| userID | | | | | | | | | | | | | | | | | | | | | |
| 2033 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2276 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

4385	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

5 rows × 66574 columns

python 复制代码

sparsity=1.0-len(ratings_explicit)/float(users_exp_ratings.shape[0]*n_books)
print ('图书交叉数据集的稀疏级别是 ' +  str(sparsity*100) + ' %')

复制代码

图书交叉数据集的稀疏级别是 99.99772184106935 %

python 复制代码

## 基于用户的协同过滤
global metric,k
k=10
metric='cosine'
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
            
    return similarities,indices

python 复制代码

def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    prediction=0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices=findksimilarusers(user_id, ratings,metric, k) #similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc,:].mean() #to adjust for zero based indexing
    sum_wt = np.sum(similarities)-1
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    #在非常稀疏的数据集的情况下，使用基于协作的方法的相关度量可能会给出负面的评价
    #在这里的处理如下
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    print ('用户预测等级 {0} -> item {1}: {2}'.format(user_id,item_id,prediction))
 
    return prediction

python 复制代码

## 测试
predict_userbased(11676,'0001056107',ratings_matrix)

复制代码

用户预测等级 11676 -> item 0001056107: 2





2

python 复制代码

## 基于项目的协同过滤
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    similarities=[]
    indices=[]
    ratings=ratings.T
    loc = ratings.index.get_loc(item_id)
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)
    
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
 
    return similarities,indices

python 复制代码

def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1
    product=1
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == item_loc:
            continue;
        else:
            product = ratings.iloc[user_loc,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    
    #在非常稀疏的数据集的情况下，使用基于协作的方法的相关度量可能会给出负面的评价
    #在这里处理的是下面的//代码，没有下面的代码片段，下面的代码片段是为了避免负面影响
    #在使用相关度规时，可能会出现非常稀疏的数据集的预测
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
 
    print ('用户预测等级 {0} -> item {1}: {2}'.format(user_id,item_id,prediction)    )  
    
    return prediction

python 复制代码

## 测试
prediction = predict_itembased(11676,'0001056107',ratings_matrix)

复制代码

用户预测等级 11676 -> item 0001056107: 1

python 复制代码