# How to Implement Naive Bayes from Scratch in Python
# How To Implement Naive Bayes From Scratch in Python
# http://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
# Dataset
# https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data
import csv
import math
import random


# Handle data
def loadCsv(filename):
    """Load a numeric CSV file as a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of floats per non-empty CSV line.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
    """
    # The context manager closes the file even if a conversion fails.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields [] for blank lines; skip them so that every
        # returned row has a last element (the class value).
        return [[float(x) for x in row] for row in csv.reader(handle) if row]


# Split dataset with ratio
def splitDataset(dataset, splitRatio):
    """Randomly split ``dataset`` into a training part and a remainder.

    Rows are drawn without replacement using ``random.randrange``, so the
    result depends on the state of the ``random`` module. ``dataset`` itself
    is not modified; a shallow copy is consumed instead.

    Args:
        dataset: List of rows.
        splitRatio: Fraction of rows to draw; the training size is
            ``int(len(dataset) * splitRatio)``.

    Returns:
        ``[trainSet, remaining]`` where ``remaining`` holds the rows that
        were not drawn, in their original relative order.

    Raises:
        ValueError: From ``random.randrange`` when more rows are requested
            than are available (``splitRatio`` greater than 1).
    """
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    remaining = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]


# Separate by Class
def separateByClass(dataset):
    """Group rows by their last element (the class value).

    Example: ``[[1, 20, 1], [2, 21, 0], [3, 22, 1]]`` becomes
    ``{1: [[1, 20, 1], [3, 22, 1]], 0: [[2, 21, 0]]}``.

    Args:
        dataset: List of non-empty rows.

    Returns:
        A dict mapping each class value to the list of rows that end with it.
    """
    separated = {}
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated


# Calculate Mean
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    return sum(numbers) / float(len(numbers))


def stdev(numbers):
    """Return the sample standard deviation (N - 1 denominator).

    A single value has no spread to estimate, so 0.0 is returned for it
    instead of dividing by zero.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty (raised by ``mean``).
    """
    avg = mean(numbers)
    if len(numbers) < 2:
        return 0.0
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)


# Summarize Dataset
def summarize(dataset):
    """Return ``(mean, stdev)`` for every attribute column of ``dataset``.

    The last column is treated as the class value and is left out, so it
    does not have to be numeric.
    Example: ``[[1, 20, 0], [2, 21, 1], [3, 22, 0]]`` gives
    ``[(2.0, 1.0), (21.0, 1.0)]``.

    Args:
        dataset: List of rows of equal length.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per attribute column; an
        empty list when ``dataset`` is empty.
    """
    # Transpose rows into columns, then drop the class column before
    # computing any statistics on it.
    attributeColumns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column)) for column in attributeColumns]


# Summarize attributes by class
def summarizeByClass(dataset):
    """Return the per-attribute ``(mean, stdev)`` summaries of each class.

    Args:
        dataset: List of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the ``summarize`` result of the
        rows belonging to that class.
    """
    return {
        classValue: summarize(instances)
        for classValue, instances in separateByClass(dataset).items()
    }


# Calculate Gaussian Probability Density Function
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    For example, x = 71.5 with mean = 73 and stdev = 6.2 gives about 0.0625.

    Args:
        x: Value to evaluate.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value. When ``stdev`` is 0 the density is undefined, so
        the attribute is treated as a constant: 1.0 if ``x`` equals ``mean``,
        otherwise 0.0.
    """
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent


# Calculate Class Probabilities
def calculateClassProbabilities(summaries, inputVector):
    """Multiply the per-attribute densities of ``inputVector`` for each class.

    Only the first ``len(classSummaries)`` entries of ``inputVector`` are
    read, so a trailing class value (or placeholder such as ``'?'``) is
    ignored.

    Args:
        summaries: Dict of class value -> list of ``(mean, stdev)`` tuples.
        inputVector: Attribute values in the same order as the summaries.

    Returns:
        A dict mapping each class value to the product of its densities.

    Raises:
        IndexError: If ``inputVector`` has fewer entries than a class has
            summaries.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        # Local names differ from mean()/stdev() to avoid shadowing them.
        for i, (attributeMean, attributeStdev) in enumerate(classSummaries):
            likelihood *= calculateProbability(
                inputVector[i], attributeMean, attributeStdev
            )
        probabilities[classValue] = likelihood
    return probabilities


# Make a prediction
def predict(summaries, inputVector):
    """Return the class value with the highest probability product.

    Args:
        summaries: Dict of class value -> list of ``(mean, stdev)`` tuples.
        inputVector: Attribute values of the instance to classify.

    Returns:
        The best class value, the first one encountered on ties, or ``None``
        when ``summaries`` is empty.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    if not probabilities:
        return None
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(probabilities, key=probabilities.get)


# Get predictions
def getPredictions(summaries, testSet):
    """Return the predicted class value for every row of ``testSet``."""
    return [predict(summaries, row) for row in testSet]


# Get Accuracy
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    Args:
        testSet: List of rows whose last element is the true class value.
        predictions: Predicted class values, aligned with ``testSet``.

    Returns:
        A float between 0.0 and 100.0; 0.0 when ``testSet`` is empty.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    if not testSet:
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0


def main():
    """Train on a random 67% of the Pima diabetes CSV and print the accuracy."""
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))


# Run the demo only when executed as a script, so importing this module
# does not try to read the dataset file.
if __name__ == "__main__":
    main()