樸素貝葉斯文字分類的C++實現

kkkkkkkkq發表於2017-08-05
//NaiveBayes.h
#ifndef NAIVEBAYES_H_
#define NAIVEBAYES_H_
#include<iostream>
#include<map>
#include<set>
#include<cmath>
#include<vector>
#include<algorithm>
#include<numeric>
#include<cstring>
#include<stdio.h>
#include<cstdlib>
using namespace std;


// Two-class naive Bayes text classifier over a small built-in training set.
// Class 1 marks an abusive document, class 0 a non-abusive one.
//
// Typical call order: create_vocab_list(), get_train_matrix(), train_NB0(),
// then classify_NB() for each new document.
class NaiveBayes
{
public:
	// Loads the built-in sample documents and their class labels.
	NaiveBayes();

	// Gives every distinct training word a 1-based vocabulary index.
	void create_vocab_list();

	// Builds the 0/1 feature vector of training document `idx` in return_vec.
	void set_of_words_to_vec(int idx);

	// Rebuilds train_mat with one feature vector per training document.
	void get_train_matrix();

	// Writes train_mat to standard output, one row per line.
	void print();

	// Estimates p_abusive and the per-word log probabilities from train_mat.
	void train_NB0();

	// Classifies a document given as an array of words ending with "null".
	// Returns the predicted class label; 1 marks an abusive document,
	// 0 a non-abusive one.
	int classify_NB(std::string *doc_to_classify);

private:
	// Training documents, each stored as its list of words.
	std::vector< std::vector<std::string> > list_of_posts;

	// Class label of each training document (1 = abusive, 0 = not).
	std::vector<int> list_classes;

	// Word -> vocabulary index; indices start at 1.
	std::map<std::string, int> my_vocab_list;

	// Scratch buffer for the most recently built 0/1 feature vector
	// (heap array; not initialised by the constructor).
	int *return_vec;

	// One 0/1 feature vector per training document.
	std::vector< std::vector<int> > train_mat;

	// Per-word log probabilities for class 0, filled in by train_NB0().
	std::vector<float> p0vect;

	// Per-word log probabilities for class 1, filled in by train_NB0().
	std::vector<float> p1vect;

	// Fraction of training documents labelled abusive.
	float p_abusive;
};
#endif // !NAIVEBAYES_H_

//NaiveBayes.cpp
#include"stdafx.h"
#include"NaiveBayes.h"

// Built-in training corpus: six short posts, one per row. Each row holds at
// most ten entries and ends with the sentinel word "null", which marks the
// end of the post and is not part of it. Unused trailing entries are empty
// strings.
std::string posting_list[6][10] =
{
	/* 0 */ { "my", "dog", "has", "flea", "problems", "help", "please", "null" },
	/* 1 */ { "maybe", "not", "take", "him", "to", "dog", "park", "stupid", "null" },
	/* 2 */ { "my", "dalmation", "is", "so", "cute", "I", "love", "him", "null" },
	/* 3 */ { "stop", "posting", "stupid", "worthless", "garbage", "null" },
	/* 4 */ { "mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him", "null" },
	/* 5 */ { "quit", "buying", "worthless", "dog", "food", "stupid", "null" }
};

// Class label of each row of posting_list: 1 = abusive, 0 = not abusive.
int class_vec[6] = { 0, 1, 0, 1, 0, 1 };


// Loads the built-in training set: copies the six rows of posting_list into
// list_of_posts (dropping the "null" sentinel) and the labels of class_vec
// into list_classes.
NaiveBayes::NaiveBayes()
{
	for (int doc = 0; doc < 6; ++doc)
	{
		// Count the words that come before the "null" end marker.
		int word_count = 0;
		while (posting_list[doc][word_count] != "null")
		{
			++word_count;
		}

		// Copy exactly those words as one document.
		list_of_posts.push_back(
			std::vector<std::string>(posting_list[doc], posting_list[doc] + word_count));
	}

	// Append every label in class_vec, in order.
	list_classes.insert(list_classes.end(),
		class_vec, class_vec + sizeof(class_vec) / sizeof(class_vec[0]));
}

// Builds the vocabulary: every distinct word of list_of_posts receives a
// 1-based index in order of first appearance. Index 0 is never handed out,
// so a stored value of 0 means "no index assigned yet".
void NaiveBayes::create_vocab_list()
{
	int next_index = 1;
	for (std::size_t doc = 0; doc < list_of_posts.size(); ++doc)
	{
		const std::vector<std::string> &words = list_of_posts[doc];
		for (std::size_t w = 0; w < words.size(); ++w)
		{
			// operator[] creates a zero-valued entry for an unseen word, so a
			// zero slot identifies a word that still needs an index.
			int &slot = my_vocab_list[words[w]];
			if (slot == 0)
			{
				slot = next_index++;
			}
		}
	}
}//create_vocab_list

// Builds the set-of-words (0/1) feature vector of training document `idx`
// and stores it in return_vec.
//
// return_vec becomes a freshly allocated, zero-initialised array of
// my_vocab_list.size() + 1 ints. Slot p is set to 1 when the word whose
// vocabulary index is p occurs in posting_list[idx]; slot 0 is never set
// because vocabulary indices start at 1.
//
// Ownership: the caller must delete[] return_vec when done with it. The
// previous value of return_vec is overwritten without being freed, because
// it may be uninitialised or already released.
//
// idx: row of posting_list to encode. An out-of-range idx is reported on
//      cerr and produces an all-zero vector instead of reading past the table.
void NaiveBayes::set_of_words_to_vec(int idx)
{
	std::cout << "set of words to vec begin the document id is : " << idx << std::endl;

	const std::size_t len = my_vocab_list.size() + 1;
	return_vec = new int[len](); // the trailing () value-initialises every element to 0
	for (std::size_t i = 0; i < len; i++)
		std::cout << return_vec[i] << " ";

	const int row_count = static_cast<int>(sizeof(posting_list) / sizeof(posting_list[0]));
	const int row_width = static_cast<int>(sizeof(posting_list[0]) / sizeof(posting_list[0][0]));
	if (idx < 0 || idx >= row_count)
	{
		std::cout << std::endl;
		std::cerr << "set_of_words_to_vec: document id " << idx << " is out of range" << std::endl;
		return;
	}

	// Stop at the "null" sentinel, or at the end of the row if it is missing.
	for (int w = 0; w < row_width && posting_list[idx][w] != "null"; w++)
	{
		// find() rather than operator[]: a lookup must not add the word to the
		// vocabulary, otherwise the map would outgrow the buffer sized above.
		const std::map<std::string, int>::const_iterator hit = my_vocab_list.find(posting_list[idx][w]);
		if (hit != my_vocab_list.end() && hit->second > 0
			&& static_cast<std::size_t>(hit->second) < len)
		{
			return_vec[hit->second] = 1;
		}
	}
	std::cout << std::endl;
}//set_of_words_to_vec

void NaiveBayes::get_train_matrix()
{
	cout << "get train matrix begin : " << endl;
	train_mat.clear();
	for (int i = 0; i<6; i++)
	{
		set_of_words_to_vec(i);
		vector<int> vec(return_vec, return_vec + my_vocab_list.size() + 1);
		train_mat.push_back(vec);
		delete[]return_vec;
	}
}//get train matrix

// Writes train_mat to standard output: a heading line, then one line per
// row with every entry followed by a single space.
void NaiveBayes::print()
{
	std::cout << "print the train matrix begin : " << std::endl;
	for (std::size_t row = 0; row < train_mat.size(); ++row)
	{
		const std::vector<int> &cells = train_mat[row];
		for (std::size_t col = 0; col < cells.size(); ++col)
		{
			std::cout << cells[col] << " ";
		}
		std::cout << std::endl;
	}
}//print()

// Trains the classifier from train_mat and list_classes.
//
// Sets p_abusive to the fraction of training documents labelled 1, and fills
// p0vect / p1vect with log(count / denominator) per vocabulary slot for
// class 0 / class 1. Every count starts at 1 and every denominator at 2, so
// a word never seen in a class still gets a finite log probability.
//
// Both vectors are reset on every call, so training again does not build on
// the previous result. If train_mat is empty the function reports it on cerr
// and leaves the model untouched.
void NaiveBayes::train_NB0()
{
	// Nothing to learn from, and train_mat[0] below would be out of bounds.
	if (train_mat.empty())
	{
		std::cerr << "train_NB0: training matrix is empty, call get_train_matrix() first" << std::endl;
		return;
	}

	const std::size_t num_train_docs = train_mat.size();
	std::cout << "num_train_docs = " << num_train_docs << std::endl;
	const std::size_t vec_len = train_mat[0].size();

	/* number of documents labelled abusive (labels are 0 or 1) */
	const int sum = std::accumulate(list_classes.begin(), list_classes.end(), 0);
	std::cout << "sum = " << sum << std::endl;
	p_abusive = static_cast<float>(sum) / static_cast<float>(num_train_docs);
	std::cout << "p_abusive = " << p_abusive << std::endl;

	// assign(), not resize(): resize() would keep the values of an earlier
	// training run instead of restarting every count at 1.
	p0vect.assign(vec_len, 1.0f); // per-word counts in non-abusive docs
	p1vect.assign(vec_len, 1.0f); // per-word counts in abusive docs
	// size() returns size_t; convert explicitly to match the %d conversions.
	printf("p0num.size() = %d , p1num.size() = %d\n",
		static_cast<int>(p0vect.size()), static_cast<int>(p1vect.size()));
	float p0Denom = 2.0f; // total number of words in non-abusive docs
	float p1Denom = 2.0f; // total number of words in abusive docs

	/* accumulate the per-word counts and the per-class word totals */
	const std::size_t labelled_docs = std::min(list_classes.size(), train_mat.size());
	for (std::size_t i = 0; i < labelled_docs; i++)
	{
		const bool abusive = (list_classes[i] == 1);
		std::vector<float> &counts = abusive ? p1vect : p0vect;
		float &denom = abusive ? p1Denom : p0Denom;

		const std::vector<int> &row = train_mat[i];
		const std::size_t cols = std::min(row.size(), vec_len);
		for (std::size_t j = 0; j < cols; j++)
		{
			counts[j] += row[j];
			if (row[j] == 1)
				denom++;
		}
	}

	// Store logarithms so that classification can add terms instead of
	// multiplying many small probabilities.
	for (std::size_t i = 0; i < vec_len; i++)
	{
		p0vect[i] = std::log(p0vect[i] / p0Denom);
		p1vect[i] = std::log(p1vect[i] / p1Denom);
	}

	std::cout << "print the p0vect values : " << std::endl;
	for (std::size_t i = 0; i < p0vect.size(); i++)
		std::cout << p0vect[i] << " ";
	std::cout << "\nprint the p1vect values : " << std::endl;
	for (std::size_t i = 0; i < p1vect.size(); i++)
		std::cout << p1vect[i] << " ";
	std::cout << std::endl;
}

// Classifies a document with the trained model.
//
// doc_to_classify: array of words terminated by the sentinel word "null".
//                  Words that are not in the vocabulary are ignored.
//
// Returns 1 when the abusive class has the higher log score, 0 otherwise,
// and -1 (with a message on cerr) when no document is given or when
// p0vect / p1vect do not match the current vocabulary size, i.e. the model
// has not been trained for it.
int NaiveBayes::classify_NB(std::string *doc_to_classify)
{
	const std::size_t len = my_vocab_list.size() + 1; // slot 0 is unused
	if (doc_to_classify == NULL || p0vect.size() != len || p1vect.size() != len)
	{
		std::cerr << "classify_NB: no document given or model not trained for the current vocabulary" << std::endl;
		return -1;
	}

	// A local vector instead of a new[] buffer: nothing to free or leak.
	std::vector<int> features(len, 0);
	for (int i = 0; doc_to_classify[i] != "null"; i++)
	{
		// find() rather than operator[]: looking up an unknown word must not
		// add it to the vocabulary.
		const std::map<std::string, int>::const_iterator hit = my_vocab_list.find(doc_to_classify[i]);
		if (hit != my_vocab_list.end() && hit->second > 0
			&& static_cast<std::size_t>(hit->second) < len)
		{
			features[hit->second] = 1;
		}
	}//for

	for (std::size_t i = 0; i < len; i++)
		std::cout << features[i] << " ";
	std::cout << std::endl;

	// The initial value must be a float: inner_product accumulates in the
	// type of that argument, and an int 0 would truncate every partial sum.
	const float p1 = std::inner_product(p1vect.begin() + 1, p1vect.end(), features.begin() + 1, 0.0f)
		+ std::log(p_abusive);
	const float p0 = std::inner_product(p0vect.begin() + 1, p0vect.end(), features.begin() + 1, 0.0f)
		+ std::log(1.0f - p_abusive);

	std::cout << "p1 = " << p1 << std::endl;
	std::cout << "p0 = " << p0 << std::endl;

	return (p1 > p0) ? 1 : 0;
}

//main.cpp
#include"stdafx.h"
#include"stdlib.h "
#include<iostream>
#include"NaiveBayes.h"
using namespace std;

// Demo driver: trains the classifier on the built-in posts and classifies
// two sample documents, printing the predicted label of each.
int main()
{
	NaiveBayes nb;
	nb.create_vocab_list();
	nb.get_train_matrix();
	nb.print();
	nb.train_NB0();

	// Each document is an array of words ending with the "null" sentinel.
	std::string doc1_to_classify[] = { "love", "my", "dalmation", "null" };
	std::string doc2_to_classify[] = { "stupid", "garbage", "null" };

	// Classify before streaming the result line: classify_NB prints its own
	// diagnostics, and calling it inside the << chain would interleave them
	// with the label (in an order that is unspecified before C++17).
	const int doc1_label = nb.classify_NB(doc1_to_classify);
	std::cout << "doc1 classified as : " << doc1_label << std::endl;
	const int doc2_label = nb.classify_NB(doc2_to_classify);
	std::cout << "doc2 classified as : " << doc2_label << std::endl;

	system("pause"); // keeps the console window open (Windows shell command)
	return 0;
}



相關文章