Import Necessary Libraries
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import random
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from pandas import DataFrame
Read Source File and Display
In this section, we’ll work with basic CSV operations and display the results. We’ll read the train.csv file downloaded from Kaggle and show its contents.
# Path of the training CSV passed to testCSV() and readTrainDataCSV().
trainFilePath = './train.csv'
# Number of data rows in the training CSV; overwritten by testCSV().
trainSize = 0
def testCSV(filePath):
with open(filePath, 'rb') as trainFile:
global trainSize
csvReader = csv.reader(trainFile)
dataList = [data for data in csvReader]
df = DataFrame(dataList[1:], columns=dataList[0])
trainSize = len(df)
print(df)
print("trainSize", trainSize)
# Display the training file and record its row count in trainSize.
testCSV(trainFilePath)
Read Source File, Extract Data, and Build Neural Network
In this section, we’ll extract four features from the source file — passenger class (Pclass), sex, SibSp, and ticket fare — and use them to predict survival. Then we’ll build a 5-layer neural network (an input layer, three hidden layers, and an output layer) with 4-10-20-10-2 neurons respectively. Finally, we’ll execute the reading function.
def readTrainDataCSV(filePath):
    """Load training features and labels from a CSV and build the classifier.

    The CSV's first row is the header. The columns ``Pclass``, ``Sex``,
    ``SibSp``, ``Fare`` and ``Survived`` are read from every data row;
    ``Sex`` is encoded as 0 for ``'female'`` and 1 for anything else, the
    other features are converted with ``float`` and the label with ``int``.

    Args:
        filePath: Path of the training CSV file.

    Side effects:
        Sets the module-level ``trainData`` (float32, one row of four
        features per sample), ``targetData`` (int32, one label per sample)
        and ``classifier`` (an untrained ``DNNClassifier``), and prints each
        parsed row plus the resulting arrays.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If a numeric cell cannot be converted.
    """
    global trainData, targetData, classifier
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; a plain 'rb' makes csv.reader fail there.
    if sys.version_info[0] >= 3:
        trainFile = open(filePath, 'r', newline='')
    else:
        trainFile = open(filePath, 'rb')
    with trainFile:
        dataList = list(csv.reader(trainFile))

    dataSize = len(dataList) - 1  # first row is the header
    trainData = np.ndarray((dataSize, 4), dtype=np.float32)
    targetData = np.ndarray((dataSize, 1), dtype=np.int32)
    trainDataFrame = DataFrame(dataList[1:], columns=dataList[0])
    selected = trainDataFrame.loc[:, ['Pclass', 'Sex', 'SibSp', 'Fare', 'Survived']]

    thisData = None
    for i, thisData in enumerate(selected.values):
        Pclass, Sex, SibSp, Fare, Survived = thisData
        Pclass = float(Pclass)
        Sex = 0 if Sex == 'female' else 1
        SibSp = float(SibSp)
        Fare = float(Fare)
        Survived = int(Survived)
        print(Pclass, Sex, SibSp, Fare, Survived)
        trainData[i, :] = [Pclass, Sex, SibSp, Fare]
        targetData[i, :] = [Survived]

    # Echo the last raw row only when there was one; with a header-only file
    # there is nothing to show.
    if thisData is not None:
        print(thisData)
    print(trainData)
    print(targetData)

    # One unnamed real-valued feature column of width 4 feeds three hidden
    # layers (10, 20, 10 units) and a 2-class output. model_dir (e.g.
    # "/tmp/titanic_model") is deliberately left unset here.
    feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
    classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                                hidden_units=[10, 20, 10],
                                                n_classes=2)
# Populate trainData/targetData and build the classifier from the training file.
readTrainDataCSV(trainFilePath)
Create Input Data
We’ll wrap the training data and labels in TensorFlow constants and return them as a tuple.
def get_train_inputs():
    """Return the training set as a ``(features, labels)`` pair of constants.

    Wraps the module-level ``trainData`` and ``targetData`` arrays in
    ``tf.constant`` tensors, prints both tensors, and returns them.
    """
    features = tf.constant(trainData)
    labels = tf.constant(targetData)
    for tensor in (features, labels):
        print(tensor)
    return features, labels
# Demonstration call: prints the feature and label tensors; the returned tuple is discarded.
get_train_inputs()
Train the Model
Now we start training the neural network.
def train():
    """Fit the module-level classifier on the training inputs for 2000 steps."""
    num_steps = 2000
    classifier.fit(input_fn=get_train_inputs, steps=num_steps)
# Run the training pass.
train()
Check Accuracy
We use the entire dataset to check accuracy. Note that we should use a validation set for this task, but since this is just for demonstration purposes, we’ll skip that step.
# Evaluate on the same inputs that were used for training (a single
# evaluation step) and read the "accuracy" entry of the returned metrics.
accuracy_score = classifier.evaluate(input_fn=get_train_inputs,
steps=1)["accuracy"]
print("accuracy:",accuracy_score)
Read Test Set and Output Results
In this section, we’ll read the test data from Kaggle and output the results to a file, which will ultimately be submitted to the official website.
# Path of the test CSV passed to readTestDataCSV().
testFilePath = './test.csv'
def readTestDataCSV(filePath):
    """Load the test features from a CSV into module-level state.

    The CSV's first row is the header. The columns ``Pclass``, ``Sex``,
    ``SibSp`` and ``Fare`` are read from every data row; ``Sex`` is encoded
    as 0 for ``'female'`` and 1 for anything else, and an empty ``Fare``
    cell is stored as 0.

    Args:
        filePath: Path of the test CSV file.

    Side effects:
        Sets the module-level ``testData`` (float32, one row of four
        features per sample) and ``PassengerIdStart`` (the ``PassengerId``
        of the first data row, as an int), and prints the parsed rows and
        the resulting array.

    Raises:
        KeyError: If a required column is missing or there is no data row.
        ValueError: If a numeric cell cannot be converted.
    """
    global testData, PassengerIdStart
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; a plain 'rb' makes csv.reader fail there.
    if sys.version_info[0] >= 3:
        testFile = open(filePath, 'r', newline='')
    else:
        testFile = open(filePath, 'rb')
    with testFile:
        dataList = list(csv.reader(testFile))

    dataSize = len(dataList) - 1  # first row is the header
    testDataFrame = DataFrame(dataList[1:], columns=dataList[0])
    selected = testDataFrame.loc[:, ['Pclass', 'Sex', 'SibSp', 'Fare']]
    testData = np.ndarray((dataSize, 4), dtype=np.float32)
    PassengerIdStart = int(testDataFrame['PassengerId'][0])
    print('PassengerId', PassengerIdStart)

    thisData = None
    for i, thisData in enumerate(selected.values):
        Pclass, Sex, SibSp, Fare = thisData
        Pclass = float(Pclass)
        Sex = 0 if Sex == 'female' else 1
        SibSp = float(SibSp)
        # A missing fare is stored as 0 rather than failing the conversion.
        Fare = 0 if Fare == '' else float(Fare)
        print(Pclass, Sex, SibSp, Fare)
        testData[i, :] = [Pclass, Sex, SibSp, Fare]

    # Echo the last raw row only when there was one.
    if thisData is not None:
        print(thisData)
    print(testData)
def testData_samples():
    """Input function for prediction: return the module-level ``testData`` array."""
    samples = testData
    return samples
# Load the test features, predict a class for every row, and show the result.
readTestDataCSV(testFilePath)
predictions = list(classifier.predict(input_fn=testData_samples))
print(predictions)

# csv.writer needs a binary file on Python 2 but a text file opened with
# newline='' on Python 3; a plain 'wb' makes writerow fail on Python 3.
if sys.version_info[0] >= 3:
    csvfile = open('predictions.csv', 'w', newline='')
else:
    csvfile = open('predictions.csv', 'wb')
with csvfile:
    writer = csv.writer(csvfile, dialect='excel')
    writer.writerow(['PassengerId', 'Survived'])
    # Ids are numbered consecutively from the first test row's PassengerId,
    # i.e. this assumes the test file lists passengers in id order with no gaps.
    PassengerId = PassengerIdStart
    for i in predictions:
        writer.writerow([PassengerId, i])
        PassengerId += 1
Finally, using only 4 features, we achieved an accuracy of 75%. The next goal is to utilize the other available data.