//NaiveBayes.h
#ifndef NAIVEBAYES_H_
#define NAIVEBAYES_H_
#include<iostream>
#include<map>
#include<set>
#include<cmath>
#include<vector>
#include<algorithm>
#include<numeric>
#include<cstring>
#include<stdio.h>
#include<cstdlib>
using namespace std;
/**
 * Two-class naive Bayes text classifier trained on a small built-in corpus.
 *
 * Call order used by the program:
 *   create_vocab_list() -> get_train_matrix() -> train_NB0() -> classify_NB().
 */
class NaiveBayes
{
private:
    /// Training posts, one word list per post (sentinel entries removed).
    std::vector< std::vector<std::string> > list_of_posts;
    /// Class label of each training post (1 = abusive, 0 = not abusive).
    std::vector<int> list_classes;
    /// Word -> slot index; real words receive indices starting at 1.
    std::map<std::string, int> my_vocab_list;
    /// Raw pointer to a new[]-allocated word-presence vector (manually managed).
    int *return_vec;
    /// One word-presence row per training post; slot 0 of every row is unused.
    std::vector< std::vector<int> > train_mat;
    /// Per-slot log-probabilities for the non-abusive class.
    std::vector<float> p0vect;
    /// Per-slot log-probabilities for the abusive class.
    std::vector<float> p1vect;
    /// Fraction of training posts labelled abusive.
    float p_abusive;

public:
    /// Copies the built-in posts and labels into the object.
    NaiveBayes();

    /// Assigns a slot index to every distinct word of the training posts.
    void create_vocab_list();

    /// Builds the word-presence vector of built-in post number idx in return_vec.
    void set_of_words_to_vec(int idx);

    /// Fills train_mat with one word-presence row per built-in post.
    void get_train_matrix();

    /// Writes train_mat to standard output, one row per line.
    void print();

    /// Estimates p_abusive, p0vect and p1vect from train_mat and list_classes.
    void train_NB0();

    /// Classifies a "null"-terminated word array; 1 = abusive, 0 = not abusive.
    int classify_NB(std::string *doc_to_classify);
};
#endif // !NAIVEBAYES_H_
//NaiveBayes.cpp
#include"stdafx.h"
#include"NaiveBayes.h"
// Built-in training corpus: six posts of at most ten entries each.
// Every row ends with the sentinel word "null"; unused trailing entries are
// default-constructed (empty) strings. Row 4 fills all ten entries, so the
// table has no spare capacity for a longer post.
string posting_list[6][10] =
{
    /* post 0 */ { "my", "dog", "has", "flea", "problems", "help", "please", "null" },
    /* post 1 */ { "maybe", "not", "take", "him", "to", "dog", "park", "stupid", "null" },
    /* post 2 */ { "my", "dalmation", "is", "so", "cute", "I", "love", "him", "null" },
    /* post 3 */ { "stop", "posting", "stupid", "worthless", "garbage", "null" },
    /* post 4 */ { "mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him", "null" },
    /* post 5 */ { "quit", "buying", "worthless", "dog", "food", "stupid", "null" }
};

// Label of each post above, in the same order: 1 = abusive, 0 = not abusive.
int class_vec[6] = { 0, 1, 0, 1, 0, 1 };
/**
 * Loads the built-in corpus into the object.
 *
 * Copies every post of posting_list (up to, and excluding, its "null"
 * sentinel) into list_of_posts and every label of class_vec into
 * list_classes. The scalar members are given defined start values so that
 * nothing reads an indeterminate pointer or float before training.
 */
NaiveBayes::NaiveBayes()
    : return_vec(0), p_abusive(0.0f)
{
    // Derive the table dimensions from the array itself instead of repeating
    // the literal sizes.
    const int post_count = static_cast<int>(sizeof(posting_list) / sizeof(posting_list[0]));
    const int max_words = static_cast<int>(sizeof(posting_list[0]) / sizeof(posting_list[0][0]));

    for (int post = 0; post < post_count; ++post)
    {
        vector<string> words;
        // Stop at the sentinel, but never walk past the end of the row if a
        // row was written without one.
        for (int col = 0; col < max_words && posting_list[post][col] != "null"; ++col)
        {
            words.push_back(posting_list[post][col]);
        }
        list_of_posts.push_back(words);
    }

    const int label_count = static_cast<int>(sizeof(class_vec) / sizeof(class_vec[0]));
    list_classes.insert(list_classes.end(), class_vec, class_vec + label_count);
}
/**
 * Builds the vocabulary from the training posts.
 *
 * Walks the posts in order and gives each word whose map entry is still 0
 * the next free slot index. Indices start at 1; 0 is the "no index yet"
 * value that map::operator[] produces for a new key.
 */
void NaiveBayes::create_vocab_list()
{
    int next_slot = 1;
    for (vector< vector<string> >::const_iterator post = list_of_posts.begin();
         post != list_of_posts.end(); ++post)
    {
        for (vector<string>::const_iterator word = post->begin();
             word != post->end(); ++word)
        {
            // operator[] creates the entry with value 0 on first sight.
            int &slot = my_vocab_list[*word];
            if (slot == 0)
            {
                slot = next_slot++;
            }
        }
    }
}//create_vocab_list
/**
 * Builds the word-presence vector of one built-in post.
 *
 * Allocates return_vec with new[] (vocabulary size + 1 ints, all zero) and
 * sets slot k to 1 for every word of posting_list[idx] whose vocabulary
 * index is k. Slot 0 is never set because vocabulary indices start at 1.
 *
 * The vocabulary is only read (map::find); words that are not in it are
 * skipped and never inserted, so the vocabulary size cannot change between
 * sizing the buffer and a caller copying it.
 *
 * Ownership: the buffer is handed over through return_vec and must be
 * released by the caller with delete[]. The previous value of return_vec is
 * overwritten without being freed.
 *
 * @param idx row of posting_list to encode; an index outside the table
 *            leaves the vector all zero.
 */
void NaiveBayes::set_of_words_to_vec(int idx)
{
    cout << "set of words to vec begin the document id is : " << idx << endl;

    const int len = static_cast<int>(my_vocab_list.size()) + 1;
    return_vec = new int[len](); // trailing () value-initialises every element to 0

    // Diagnostic dump of the freshly zeroed buffer (printed before any slot is set).
    for (int i = 0; i < len; i++)
        cout << return_vec[i] << " ";

    const int post_count = static_cast<int>(sizeof(posting_list) / sizeof(posting_list[0]));
    const int max_words = static_cast<int>(sizeof(posting_list[0]) / sizeof(posting_list[0][0]));

    if (idx >= 0 && idx < post_count)
    {
        // Stop at the sentinel, but never read past the end of the row.
        for (int i = 0; i < max_words && posting_list[idx][i] != "null"; i++)
        {
            map<string, int>::const_iterator hit = my_vocab_list.find(posting_list[idx][i]);
            if (hit != my_vocab_list.end() && hit->second > 0 && hit->second < len)
            {
                return_vec[hit->second] = 1;
            }
        }
    }
    cout << endl;
}//set_of_words_to_vec
/**
 * Rebuilds train_mat with one word-presence row per built-in post.
 *
 * For each post, set_of_words_to_vec() allocates return_vec; this function
 * copies it into a new row of train_mat and releases the buffer.
 */
void NaiveBayes::get_train_matrix()
{
    cout << "get train matrix begin : " << endl;
    train_mat.clear();

    const int post_count = 6; // number of rows in the built-in corpus
    for (int post = 0; post < post_count; ++post)
    {
        // set_of_words_to_vec sizes its buffer from the vocabulary as it is on
        // entry, so the length is captured before the call; reading the map
        // size afterwards could exceed the allocation if the map grew.
        const size_t row_len = my_vocab_list.size() + 1;

        set_of_words_to_vec(post);
        train_mat.push_back(vector<int>(return_vec, return_vec + row_len));

        delete[] return_vec;
        return_vec = 0; // do not keep a dangling pointer to the freed buffer
    }
}//get train matrix
/**
 * Dumps the training matrix to standard output: a header line, then one
 * line per row with every element followed by a single space.
 */
void NaiveBayes::print()
{
    cout << "print the train matrix begin : " << endl;
    for (vector< vector<int> >::size_type r = 0; r < train_mat.size(); ++r)
    {
        const vector<int> &row = train_mat[r];
        for (vector<int>::size_type c = 0; c < row.size(); ++c)
        {
            cout << row[c] << " ";
        }
        cout << endl;
    }
}//print()
/**
 * Estimates the model parameters from train_mat and list_classes.
 *
 * Sets
 *   - p_abusive: sum of the labels divided by the number of training rows;
 *   - p0vect / p1vect: for each slot, log(count / denominator), where the
 *     count starts at 1 and the denominator at 2 (smoothing, so a word never
 *     seen in a class does not yield log(0)), the count adds the row values
 *     of that class and the denominator adds one per set slot of that class.
 *
 * Prints the intermediate values to standard output. If train_mat is empty
 * the function reports that on standard error and leaves the model untouched.
 * Calling it again retrains from scratch.
 */
void NaiveBayes::train_NB0()
{
    const int num_train_docs = static_cast<int>(train_mat.size());
    cout << "num_train_docs = " << num_train_docs << endl;
    if (train_mat.empty())
    {
        // train_mat[0] below would be out of bounds.
        cerr << "train_NB0: training matrix is empty; call get_train_matrix() first" << endl;
        return;
    }
    const size_t num_slots = train_mat[0].size();

    /* number of abusive documents = sum of the 0/1 labels */
    const int sum = accumulate(list_classes.begin(), list_classes.end(), 0);
    cout << "sum = " << sum << endl;
    p_abusive = static_cast<float>(sum) / static_cast<float>(num_train_docs);
    cout << "p_abusive = " << p_abusive << endl;

    // assign (not resize) so that a second training run starts again from the
    // smoothing value instead of from the previous run's logarithms.
    p0vect.assign(num_slots, 1.0f);
    p1vect.assign(num_slots, 1.0f);
    // %d expects int; size() returns size_t, hence the casts.
    printf("p0num.size() = %d , p1num.size() = %d\n",
           static_cast<int>(p0vect.size()), static_cast<int>(p1vect.size()));

    float p0Denom = 2.0f; // running denominator for non-abusive docs
    float p1Denom = 2.0f; // running denominator for abusive docs

    /* accumulate per-slot counts and per-class denominators */
    const size_t num_docs = min(list_classes.size(), train_mat.size());
    for (size_t doc = 0; doc < num_docs; ++doc)
    {
        const vector<int> &row = train_mat[doc];
        const bool abusive = (list_classes[doc] == 1);
        vector<float> &counts = abusive ? p1vect : p0vect;
        float &denom = abusive ? p1Denom : p0Denom;

        // Guard against a row shorter than the first one.
        const size_t width = min(row.size(), num_slots);
        for (size_t j = 0; j < width; ++j)
        {
            counts[j] += row[j];
            if (row[j] == 1)
                denom++;
        }
    }

    /* turn the counts into log-probabilities */
    for (size_t i = 0; i < num_slots; ++i)
    {
        p0vect[i] = log(p0vect[i] / p0Denom);
        p1vect[i] = log(p1vect[i] / p1Denom);
    }

    cout << "print the p0vect values : " << endl;
    for (size_t i = 0; i < p0vect.size(); ++i)
        cout << p0vect[i] << " ";
    cout << "\nprint the p1vect values : " << endl;
    for (size_t i = 0; i < p1vect.size(); ++i)
        cout << p1vect[i] << " ";
    cout << endl;
}
/**
 * Classifies a document given as a "null"-terminated array of words.
 *
 * Builds the document's word-presence vector, then compares
 *   p1 = sum(p1vect[k] * present[k]) + log(p_abusive)
 *   p0 = sum(p0vect[k] * present[k]) + log(1 - p_abusive)
 * over slots 1..vocabulary size. Words that are not in the vocabulary are
 * ignored and are not added to it. The vector and both scores are printed to
 * standard output.
 *
 * @param doc_to_classify array of words whose last element is "null".
 * @return 1 if p1 > p0 (abusive), 0 otherwise; -1 if the pointer is null or
 *         the trained vectors do not match the vocabulary size (model not
 *         trained, or trained against a different vocabulary).
 */
int NaiveBayes::classify_NB(string *doc_to_classify)
{
    const size_t num_slots = my_vocab_list.size() + 1;
    if (doc_to_classify == 0 || p0vect.size() != num_slots || p1vect.size() != num_slots)
    {
        cerr << "classify_NB: no document given or model not trained for the current vocabulary" << endl;
        return -1;
    }

    // Local buffer: released automatically, nothing leaks per call.
    vector<int> present(num_slots, 0);
    for (int i = 0; doc_to_classify[i] != "null"; i++)
    {
        // find() instead of operator[]: a lookup must not insert unknown
        // words, which would change the vocabulary size under our feet.
        map<string, int>::const_iterator hit = my_vocab_list.find(doc_to_classify[i]);
        if (hit != my_vocab_list.end() && hit->second > 0
            && static_cast<size_t>(hit->second) < num_slots)
        {
            present[hit->second] = 1;
        }
    }//for

    for (size_t i = 0; i < num_slots; i++)
        cout << present[i] << " ";
    cout << endl;

    // The initial value fixes the accumulator type: it must be a float, an
    // int 0 would truncate every partial sum of log-probabilities.
    const float p1 = inner_product(p1vect.begin() + 1, p1vect.end(), present.begin() + 1, 0.0f)
                     + log(p_abusive);
    const float p0 = inner_product(p0vect.begin() + 1, p0vect.end(), present.begin() + 1, 0.0f)
                     + log(1 - p_abusive);
    cout << "p1 = " << p1 << endl;
    cout << "p0 = " << p0 << endl;

    return (p1 > p0) ? 1 : 0;
}
//main.cpp
#include"stdafx.h"
#include"stdlib.h "
#include<iostream>
#include"NaiveBayes.h"
using namespace std;
/**
 * Demo driver: builds the vocabulary and training matrix from the built-in
 * corpus, trains the classifier, and classifies two sample documents.
 */
int main()
{
    NaiveBayes nb;
    nb.create_vocab_list();
    nb.get_train_matrix();
    nb.print();
    nb.train_NB0();

    // Documents are "null"-terminated word arrays, as classify_NB expects.
    string doc1_to_classify[] = { "love", "my", "dalmation", "null" };
    string doc2_to_classify[] = { "stupid", "garbage", "null" };

    // classify_NB prints its own diagnostics to cout. Calling it before the
    // result line is started keeps "label : value" together on one line,
    // instead of depending on the evaluation order inside a << chain.
    const int doc1_label = nb.classify_NB(doc1_to_classify);
    cout << "doc1 classified as : " << doc1_label << endl;
    const int doc2_label = nb.classify_NB(doc2_to_classify);
    cout << "doc2 classified as : " << doc2_label << endl;

    system("pause"); // keeps the console window open (Windows shell command)
    return 0;
}