forked from pepesan/machine-learning-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_04_titanic.py
121 lines (93 loc) · 3.6 KB
/
02_04_titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
# import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
from sklearn import datasets, svm, model_selection, tree, preprocessing, metrics
import sklearn.ensemble as ske
#pip install tensorflow
import tensorflow as tf
#pip install git+https://github.com/google/skflow.git
#from tensorflow.contrib import skflow
# Load the Titanic passenger list from the 'titanic3' sheet of the workbook.
# Reading legacy .xls workbooks needs the xlrd package (pip install xlrd).
titanic_df = pd.read_excel(
    './csv/titanic3.xls', sheet_name='titanic3', index_col=None, na_values=['NA']
)
# Column reference -- note how survival and class are encoded as integers:
#   survival: Survival (0 = no; 1 = yes)
#   class:    Passenger class (1 = first; 2 = second; 3 = third)
#   name:     Name
#   sex:      Sex
#   age:      Age
#   sibsp:    Number of siblings/spouses aboard
#   parch:    Number of parents/children aboard
#   ticket:   Ticket number
#   fare:     Passenger fare
#   cabin:    Cabin
#   embarked: Port of embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
#   boat:     Lifeboat (if survived)
#   body:     Body number (if did not survive and body was recovered)
print(titanic_df.head(10))

# Exploratory averages. numeric_only=True is required because the frame still
# contains text columns (name, ticket, ...): pandas >= 2.0 raises TypeError
# instead of silently skipping them when asked for their mean.
print("Agrupados por clase")
print(titanic_df.groupby('pclass').mean(numeric_only=True))
class_sex_grouping = titanic_df.groupby(['pclass', 'sex']).mean(numeric_only=True)
print("agrupados por clase y sexo")
print(class_sex_grouping)
# NOTE(review): nothing in this script calls matplotlib's show()/savefig(), so
# the bar charts below are presumably only visible in an interactive session.
class_sex_grouping['survived'].plot.bar()

# Survival rate per 10-year age bin: (0, 10], (10, 20], ..., (70, 80].
group_by_age = pd.cut(titanic_df["age"], np.arange(0, 90, 10))
# observed=False keeps every bin in the result (the historical default) and
# avoids depending on the default that newer pandas versions are changing.
age_grouping = titanic_df.groupby(group_by_age, observed=False).mean(numeric_only=True)
age_grouping['survived'].plot.bar()

print("Imprimimos cuenta de datos disponibles")
print(titanic_df.count())

# Drop the columns that are not used as features. Per the column reference
# above, 'boat' and 'body' are only filled in depending on whether the
# passenger survived, so they would also give the outcome away.
titanic_df = titanic_df.drop(['body', 'cabin', 'boat'], axis=1)
# Keep rows whose destination is unknown by filling it with a placeholder,
# then discard every row that still has a missing value in another column.
titanic_df["home.dest"] = titanic_df["home.dest"].fillna("NA")
titanic_df = titanic_df.dropna()
print("Imprimimos cuenta de datos disponibles tras quitarlos")
print(titanic_df.count())
def preprocess_titanic_df(df):
    """Return a numeric, model-ready copy of the cleaned Titanic frame.

    The ``sex`` and ``embarked`` columns are replaced by integer codes
    produced by scikit-learn's ``LabelEncoder``, and the free-text columns
    ``name``, ``ticket`` and ``home.dest`` are removed. The input frame is
    left untouched.

    Args:
        df: DataFrame containing at least the columns ``sex``, ``embarked``,
            ``name``, ``ticket`` and ``home.dest``.

    Returns:
        A new DataFrame with the two categorical columns label-encoded and
        the three text columns dropped.
    """
    # A fresh encoder per column; each one is fitted on that column only.
    sex_codes = preprocessing.LabelEncoder().fit_transform(df.sex)
    embarked_codes = preprocessing.LabelEncoder().fit_transform(df.embarked)
    # assign() works on a copy, so the caller's frame is not modified.
    encoded = df.assign(sex=sex_codes, embarked=embarked_codes)
    return encoded.drop(columns=['name', 'ticket', 'home.dest'])
processed_df = preprocess_titanic_df(titanic_df)
# Feature matrix: every column except the target.
X = processed_df.drop(['survived'], axis=1).values
# Target vector: the survival flag only.
y = processed_df['survived'].values
# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
# Decision tree limited to a depth of 10.
clf_dt = tree.DecisionTreeClassifier(max_depth=10)
# Fit on the training split.
clf_dt.fit(X_train, y_train)
# Accuracy on the held-out split.
print("puntuación")
print(clf_dt.score(X_test, y_test))
# Shuffled 80/20 splitter reused for every cross-validated score below.
# The first parameter of model_selection.ShuffleSplit is n_splits (the number
# of re-shuffles), not the sample count as in the removed
# sklearn.cross_validation API; passing len(X) there requested one split per
# row. NOTE(review): 20 splits is a chosen value -- adjust if another count
# was intended.
shuffle_validator = model_selection.ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
def test_classifier(clf):
    """Cross-validate ``clf`` and report its accuracy.

    Calls ``model_selection.cross_val_score`` with the module-level ``X`` and
    ``y`` arrays and the module-level ``shuffle_validator`` splitter, then
    prints the mean score followed by its standard deviation.

    Args:
        clf: Estimator to evaluate; handed to ``cross_val_score`` as-is.

    Returns:
        The array of scores returned by ``cross_val_score``, so that callers
        which use the result (for example by printing it) receive the scores
        rather than ``None``.
    """
    scores = model_selection.cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))
    return scores
# Cross-validated score of the decision tree over the shuffled splits.
# test_classifier prints the accuracy itself, so it is called directly rather
# than wrapped in print(), matching the other call sites below.
print("puntuación random")
test_classifier(clf_dt)

# Random forest with 50 trees.
clf_rf = ske.RandomForestClassifier(n_estimators=50)
print("puntuación random Forest")
test_classifier(clf_rf)

# Gradient boosting with 50 boosting stages.
clf_gb = ske.GradientBoostingClassifier(n_estimators=50)
print("puntuación gradient Boosting")
test_classifier(clf_gb)

# Voting ensemble combining the three classifiers above.
eclf = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)])
print("puntuación clasificador de votos")
test_classifier(eclf)

# Disabled experiment: neural-network classifier (kept for reference, never run).
# NOTE(review): this would not work as written -- per the TensorFlow docs,
# DNNClassifier expects feature_columns to be an iterable of feature-column
# objects, train() takes an input_fn, and estimators expose evaluate() rather
# than score(). Confirm against the installed TensorFlow version before reviving.
# tf_clf_dnn = tf.estimator.DNNClassifier(hidden_units=[20, 40, 20], n_classes=2, feature_columns='survived')
# tf_clf_dnn.train(X_train, y_train)
# score = tf_clf_dnn.score(X_test, y_test)
# print("puntuación red neuronal")
# print(score)