Apriori:这里做了一个小优化。例如,由 abcde 和 abcdf 自连接得到新项集 abcdef 时,可以直接用 abcde 的出现位置(事务编号列表)与单项 f 的出现位置取交集。这样,第 n 项集的计数只需要第 n-1 项集的位置信息和各单项自身的位置信息就能算出来,因此只需保存第 n-1 项集的位置信息即可提速。
cpp
#include<iostream>
#include<bits/stdc++.h>
#include<cstring>
#include<set>
#include<map>
#include <unordered_set>
#include<string>
#include<vector>
#include<windows.h>
#include<time.h>
// Path of the transaction file parsed by fastRead().
#define DATA_NAME R"(D:\Cpan\Download\.vscode\retail.dat)"
// Number of slots in the `items` table.
#define N 100000
// Size limits. The expressions are parenthesised so the macros keep their
// value when they are used inside a larger expression such as `2 * X`.
#define MAX_RECORD_NUM (88162 + 1)
#define MAX_ITEMID_NUM (16470 + 1)
using namespace std;
// Number of records (input lines) counted by fastRead().
int recordNum = 0;
// mp[id]: how many times item `id` was seen in the input.
int mp[MAX_ITEMID_NUM];
// items[id]: indices of the records that contain item `id`, in file order.
vector<int> items[N];
// Minimum support entered by the user, as a percentage of recordNum.
double support;
void fastRead(){
char c;
bool lastIsNum = false;
uint16_t num;
FILE* fp = fopen(DATA_NAME, "r");
while (~(c = fgetc(fp))) {
if (c >= '0' && c <= '9') {
if(lastIsNum){
num *= 10;
num += c - '0';
}
else{
num = c - '0';
}
lastIsNum = true;
}
else {
if (lastIsNum){
items[num].push_back(recordNum);
mp[num]++;
}
if (c == '\n'){
recordNum++;
}
lastIsNum = false;
}
}
if (lastIsNum) {
items[num].push_back(recordNum);
mp[num]++;
}
if (c != '\n') {
recordNum++;
}
fclose(fp);
}
// Returns the elements of tmp1 that also occur in tmp2, in tmp1's order.
//
// Both inputs are expected to be sorted in ascending order; tmp2 is walked
// with a single forward cursor, so the cost is linear in the two sizes.
// Either input may be empty, in which case the result is empty.
std::vector<int> same_number(const std::vector<int> &tmp1, const std::vector<int> &tmp2){
    std::vector<int> res;
    if (tmp1.empty() || tmp2.empty()) {
        return res;
    }

    const std::size_t last = tmp2.size() - 1;
    std::size_t ite = 0;  // cursor into tmp2, never moves backwards
    for (const int x : tmp1) {
        // Advance to the first element >= x, but never past the last one.
        while (ite < last && tmp2[ite] < x) {
            ++ite;
        }
        if (tmp2[ite] == x) {
            res.push_back(x);
        }
    }
    return res;
}
// Working state of cal().
// s[k] is the key of the k-th frequent itemset of the current level (its item
// ids, each zero-padded to 5 digits and concatenated); v[k] is the list of
// record indices that contain that itemset. s2 / v2 collect the next level.
std::vector<std::vector<int>> v, v2;
std::vector<std::string> s;
std::vector<std::string> s2;
// Number of frequent itemsets found over all levels by the last cal() run.
int total = 0;
// Level-wise (Apriori-style) frequent itemset count.
//
// Reads mp / items / recordNum / support, prints how many frequent itemsets
// exist at each size and the overall total, and leaves that total in `total`.
//
// Each itemset is identified by a key string: its item ids, zero-padded to 5
// digits and concatenated. Two itemsets of one level are joined when their
// keys share everything but the last id; the record list of the joined set is
// the intersection of the first set's record list with the record list of the
// second set's last item.
void cal(){
    const std::size_t kIdWidth = 5;
    // Minimum number of records an itemset must appear in.
    const double minCount = recordNum*support*0.01;

    total=0;
    s.clear();
    v.clear();

    // Level 1: every sufficiently frequent single item. mp has exactly
    // MAX_ITEMID_NUM entries, so the bound is exclusive.
    for(int i=0;i<(MAX_ITEMID_NUM);++i){
        if(mp[i]>=minCount){
            std::string key=std::to_string(i);
            if(key.size()<kIdWidth){
                key.insert(0,kIdWidth-key.size(),'0');
            }
            s.push_back(key);
            v.push_back(items[i]);
        }
    }

    int now_set_level=1;
    while(true){
        total+=static_cast<int>(s.size());
        cout<<"共有"<<s.size()<<"个频繁"<<now_set_level<<"项集\n";

        v2.clear();
        s2.clear();
        for(std::size_t i=0;i<s.size();++i){
            const std::size_t prefixLen=s[i].size()-kIdWidth;
            for(std::size_t j=i+1;j<s.size();++j){
                // Keys are generated in ascending order, so the itemsets that
                // share s[i]'s prefix sit directly after it; the first
                // mismatch ends the group.
                if(s[i].compare(0,prefixLen,s[j],0,prefixLen)!=0){
                    break;
                }
                const std::string lastId=s[j].substr(prefixLen,kIdWidth);
                const int num_end=std::stoi(lastId);
                std::vector<int> same_vector=same_number(v[i],items[num_end]);
                if(same_vector.size()>=minCount){
                    v2.push_back(std::move(same_vector));
                    s2.push_back(s[i]+lastId);
                }
            }
        }

        // Promote the next level without deep-copying it.
        v.swap(v2);
        s.swap(s2);
        if(v.empty()){
            break;
        }
        ++now_set_level;
    }
    cout<<"共有"<<total<<"个频繁项集\n";
}
// Entry point of the Apriori program: asks for the threshold (a percentage),
// loads the data file, runs and times cal(), then offers one re-run with a
// different threshold.
signed main(){
    cout << "请输入置信度(单位%)\n";
    cin >> support;
    fastRead();

    // Only the mining step is timed, not the file parsing.
    const long tickBefore = GetTickCount();
    cal();
    const long tickAfter = GetTickCount();
    cout << "timecost " << (tickAfter - tickBefore) << " ms" << endl;

    cout << "请输入操作\n";
    cout << " 1:更改置信度\n";
    int oper;
    cin >> oper;
    if (oper != 1) {
        return 0;
    }

    cout << "请输入置信度\n";
    cin >> support;
    cal();
    return 0;
}
FP-growth 算法:我没有递归建树,只建了一次树,所以速度比完整的 FP-growth 要慢(当知道还要建其他树的时候,实在不知道我这屎山代码该怎么再继续写下去,所以后面直接暴力处理了)。建完这一次树后,直接拿链过去爆搜子集并计数。速度慢主要是因为我的链最长有 10 左右,而 FP-growth 最后剪完只有 3-4;通过链获取子集的复杂度取决于链的长度,所以会慢。如果有办法把无用节点去掉,这种做法也会快。(以后有缘再回来改吧)
cpp
#include<iostream>
#include<bits/stdc++.h>
#include<cstring>
#include<set>
#include<map>
#include <unordered_set>
#include<string>
#include<vector>
#include<windows.h>
#include<time.h>
// Path of the transaction file parsed by fastRead().
#define DATA_NAME R"(D:\Cpan\Download\.vscode\retail.dat)"
// Number of slots in the `items` table.
#define N 100000
// Size limits. The expressions are parenthesised so that uses such as
// `10*MAX_ITEMID_NUM` multiply the whole value rather than only its first
// term.
#define MAX_RECORD_NUM (88162 + 1)
#define MAX_ITEMID_NUM (16470 + 1)
using namespace std;
// Number of records (input lines) counted by fastRead().
int recordNum = 0;
// mp[id]: how many times item `id` was seen in the input.
int mp[MAX_ITEMID_NUM];
// items[r]: item ids of record `r`, in file order.
vector<int> items[N];
// Minimum support entered by the user, as a percentage of recordNum.
double support;
void fastRead(){
char c;
bool lastIsNum = false;
uint16_t num;
FILE* fp = fopen(DATA_NAME, "r");
while (~(c = fgetc(fp))) {
if (c >= '0' && c <= '9') {
if(lastIsNum){
num *= 10;
num += c - '0';
}
else{
num = c - '0';
}
lastIsNum = true;
}
else {
if (lastIsNum){
items[recordNum].push_back(num);
mp[num]++;
}
if (c == '\n'){
recordNum++;
}
lastIsNum = false;
}
}
if (lastIsNum) {
items[recordNum].push_back(num);
mp[num]++;
}
if (c != '\n') {
recordNum++;
}
fclose(fp);
}
// Storage for the prefix tree filled by build(). Node 0 is the root; every
// other node gets the next value of node_number as its id.
int node_number = 0;
// Transactions after filtering and sorting, as prepared by main().
std::vector<std::vector<int>> v;
// head_table[item]: ids of all tree nodes that carry `item`.
std::vector<int> head_table[MAX_ITEMID_NUM];
// head_table_back[node]: item carried by `node`.
int head_table_back[10 * MAX_ITEMID_NUM];
// fp_tree_value[node]: (item, count) of `node`.
std::pair<int, int> fp_tree_value[10 * MAX_ITEMID_NUM];
// fp_tree[node]: adjacency list of (neighbour id, direction), where the
// direction is 1 for an edge to a child and -1 for the edge to the parent.
std::vector<std::pair<int, int>> fp_tree[10 * MAX_ITEMID_NUM];
// Ordering used to sort the items of a transaction before it is inserted into
// the tree: more frequent items (larger mp[]) first, ties broken by the
// smaller item id.
//
// The tie-break matters: without it two transactions may place items of equal
// frequency in different relative orders, so the same item pair ends up on
// paths with opposite ancestor/descendant roles and its count is split.
bool cmp(const int &a,const int &b){
    if (mp[a] != mp[b]) {
        return mp[a] > mp[b];
    }
    return a < b;
}
// Inserts value[index..] as a path below an existing tree node.
//
//   son   - id of the node to insert below (0 for the root)
//   fa    - id of that node's parent, or -1 when `son` is the root
//   value - the transaction's items, already in tree order
//   index - position in `value` of the first item to insert
//
// For each item, the child of the current node that carries it has its count
// incremented; if there is no such child, a new node with count 1 is created
// and registered in head_table / head_table_back.
//
// Throws std::length_error when a new node would not fit in the fixed-size
// node arrays.
void build(int son,int fa,vector<int> &value,int index){
    const int capacity =
        static_cast<int>(sizeof(fp_tree_value) / sizeof(fp_tree_value[0]));

    int current = son;
    int parent = fa;
    for (std::size_t pos = static_cast<std::size_t>(index); pos < value.size(); ++pos) {
        const int item = value[pos];

        // Look for an existing child carrying this item. The parent edge is
        // skipped; at most one child can match, so stop at the first hit.
        int child = -1;
        for (const auto &edge : fp_tree[current]) {
            if (edge.first != parent && fp_tree_value[edge.first].first == item) {
                child = edge.first;
                break;
            }
        }

        if (child != -1) {
            fp_tree_value[child].second += 1;
        }
        else {
            if (node_number + 1 >= capacity) {
                throw std::length_error("FP-tree node storage exhausted");
            }
            child = ++node_number;
            head_table[item].push_back(child);
            head_table_back[child] = item;
            fp_tree_value[child] = {item, 1};
            fp_tree[current].push_back({child, 1});
            fp_tree[child].push_back({current, -1});
        }

        parent = current;
        current = child;
    }
}
// Scratch array indexed by node id; main() zeroes part of it for every item.
int tmp_dp[10 * MAX_ITEMID_NUM];

// Appends to res_chain the items of the ancestors of node `number`, nearest
// ancestor first. A node whose count is 0 (the root) is not included and
// ends the walk.
void back(int number,vector<int> &res_chain){
    const bool atEmptyRoot = (number == 0) && (fp_tree_value[number].second == 0);
    if (atEmptyRoot) {
        return;
    }
    for (const auto &edge : fp_tree[number]) {
        // Only follow the edge that leads upwards.
        if (edge.second != -1) {
            continue;
        }
        const int up = edge.first;
        if (fp_tree_value[up].second == 0) {
            continue;
        }
        res_chain.push_back(head_table_back[up]);
        back(up, res_chain);
    }
}
// Ids of the frequent single items, collected by main() in ascending order.
std::vector<int> number_item;
// res[k]: number of frequent itemsets of size k counted by main().
int res[MAX_ITEMID_NUM];
// Worker for dfs_son(): same enumeration, but the partial subset is a single
// buffer that is extended and shrunk in place instead of being copied on
// every call.
static void dfs_son_collect(std::vector<std::vector<int>> &out,
                            const std::vector<int> &value,
                            std::vector<int> &picked,
                            std::size_t index,
                            int now_number){
    const int kMaxPickCounter = 3;  // picking is allowed while now_number <= 3
    // Nothing more can be added once the input is exhausted or the pick
    // budget is used up, so the current subset is final.
    if (index >= value.size() || now_number > kMaxPickCounter) {
        if (!picked.empty()) {
            out.push_back(picked);
        }
        return;
    }
    // Branch 1: take value[index].
    picked.push_back(value[index]);
    dfs_son_collect(out, value, picked, index + 1, now_number + 1);
    picked.pop_back();
    // Branch 2: leave it out.
    dfs_son_collect(out, value, picked, index + 1, now_number);
}

// Appends to res_son every non-empty subset of value[index..] (prefixed by
// the elements already in tmp) that can be formed while now_number stays
// within the pick budget: each picked element raises now_number by one and
// picking stops once it exceeds 3. Called with an empty tmp and now_number 1
// this yields all subsets of 1 to 3 elements.
//
// Subsets keep the element order of `value`; for each element the "take it"
// branch is explored before the "skip it" branch.
void dfs_son(std::vector<std::vector<int>> &res_son,std::vector<int> &value,std::vector<int> tmp,int index,int now_number){
    dfs_son_collect(res_son, value, tmp, static_cast<std::size_t>(index), now_number);
}
signed main(){
cout<<"请输入置信度(单位%)\n";
cin>>support;
fastRead();
for(int i=0;i<recordNum;++i){
vector<int> tmp;
for(auto t:items[i]){
if(mp[t]>=recordNum*support*0.01){
tmp.push_back(t);
}
}
if(tmp.size()==0) continue;
else{
sort(tmp.begin(),tmp.end(),cmp);
// for(auto t:tmp){cout<<t<<" ";}
// cout<<endl<<"**************************\n";
v.push_back(tmp);
}
}
// cout<<v.size()<<endl;
long starttime = GetTickCount();
for(auto t:v){
build(0,-1,t,0);
}
for(int i=0;i<MAX_ITEMID_NUM;++i){
if(mp[i]>=recordNum*support*0.01){
res[1]+=1;
number_item.push_back(i);
//cout<<i<<"*\n";
}
}
for(auto t:number_item){
for(int i=0;i<MAX_ITEMID_NUM;++i){
tmp_dp[i]=0;
}
map<vector<int>,int> map_vec;
for(auto j:head_table[t]){
//cout<<j<<"*";
vector<int> vt={};
int value=fp_tree_value[j].second;
back(j,vt);
if(!vt.size()) continue;
// vector<vector<int>> vs;
// for(int k=0;k<(1<<(vt.size()));k++){
// vector<int> resson;
// for(int o=0;o<=vt.size()-1;o++){
// if((k>>o)&1){
// resson.push_back(vt[o]);
// }
// }
// if(resson.size()){
// vs.push_back(resson);
// }
// }
// for(auto t:vs){
// map_vec[t]+=value;
// }
vector<vector<int> > res_son={};
vector<int> tmp2={};
dfs_son(res_son,vt,tmp2,0,1);
for(auto t:res_son){
map_vec[t]+=value;
}
}
for(auto t:map_vec){
if(t.second>=recordNum*support*0.01){
res[t.first.size()+1]+=1;
}
}
}
long endtime = GetTickCount();
long time_cost = endtime - starttime;
cout << "timecost " << time_cost << " ms" << endl;
int total=0;
for(int i=1;i<=MAX_ITEMID_NUM;++i){
if(res[i]!=0){
total+=res[i];
cout<<"共有"<<res[i]<<"个频繁"<<i<<"项集\n";
}
else{
cout<<"共有"<<total<<"个频繁项集\n";
break;
}
}
}