-
Notifications
You must be signed in to change notification settings - Fork 2
/
transform.py
71 lines (63 loc) · 2.87 KB
/
transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
""" Small script that shows how to do one-hot encoding
of categorical columns in a pandas DataFrame.
See:
http://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.DictVectorizer.html
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.base import TransformerMixin
def one_hot_dataframe(train, test, cols, replace=True):
    """One-hot encode the given columns of a train/test DataFrame pair.

    The vectorizer is fitted on ``train[cols]`` only and then applied to
    ``test[cols]``, so both encoded frames share the same feature columns.

    Modified from https://gist.github.com/kljensen/5452382

    Args:
        train: DataFrame the vectorizer is fitted on.
        test: DataFrame encoded with the already-fitted vectorizer.
        cols: List of column names to encode.
        replace: When truthy, ``cols`` are dropped from both frames and the
            encoded columns are joined on in their place. Otherwise the
            input frames are returned unchanged and only the fitted
            vectorizer carries the encoding.

    Returns:
        A 3-tuple ``(train, test, vec)`` of the two (possibly re-encoded)
        DataFrames and the fitted ``DictVectorizer``.
    """
    vec = DictVectorizer()
    train_matrix = vec.fit_transform(train[cols].to_dict(orient='records'))
    test_matrix = vec.transform(test[cols].to_dict(orient='records'))

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older installations.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    # Reuse the source indexes so the join below aligns row by row.
    vecTrain = pd.DataFrame(train_matrix.toarray(),
                            index=train.index, columns=feature_names)
    vecTest = pd.DataFrame(test_matrix.toarray(),
                           index=test.index, columns=feature_names)

    # Truthiness (rather than ``is True``) so flags such as numpy.bool_
    # are honoured instead of being silently ignored.
    if replace:
        train = train.drop(cols, axis=1).join(vecTrain)
        test = test.drop(cols, axis=1).join(vecTest)
    return (train, test, vec)
def scale_dataframe(train, test, cols, replace=True):
    """Standardize the given columns of a train/test DataFrame pair.

    The scaler is fitted on ``train[cols]`` only and then applied to
    ``test[cols]``.

    Args:
        train: DataFrame the scaler is fitted on.
        test: DataFrame scaled with the already-fitted scaler.
        cols: List of column names to scale.
        replace: When truthy, ``cols`` are dropped from both frames and the
            scaled columns are joined back on (so they end up as the last
            columns). Otherwise the input frames are returned unchanged.

    Returns:
        A 3-tuple ``(train, test, scaler)`` of the two (possibly rescaled)
        DataFrames and the fitted ``StandardScaler``.
    """
    scaler = StandardScaler()

    # StandardScaler has no get_feature_names() method (calling it raised
    # AttributeError); each output column corresponds to the input column at
    # the same position, so the names are simply ``cols``. The source
    # indexes are reused so the join below aligns row by row.
    scaledTrain = pd.DataFrame(data=scaler.fit_transform(train[cols]),
                               index=train.index, columns=cols)
    scaledTest = pd.DataFrame(data=scaler.transform(test[cols]),
                              index=test.index, columns=cols)

    # Truthiness (rather than ``is True``) so flags such as numpy.bool_
    # are honoured instead of being silently ignored.
    if replace:
        train = train.drop(cols, axis=1).join(scaledTrain)
        test = test.drop(cols, axis=1).join(scaledTest)
    return (train, test, scaler)
class DataFrameImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame.

    Columns of dtype object are imputed with the most frequent value
    in column.
    Columns of other types are imputed with mean of column.
    """

    def __init__(self):
        """Create an unfitted imputer; ``fit`` computes the fill values."""

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill missing entries of one column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() drops missing values, so a column with no
            # observed value yields an empty result. Fall back to NaN (a
            # no-op fill), which is also what mean() gives for an all-NaN
            # numeric column.
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X``.

        Args:
            X: DataFrame to learn the fill values from.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self, with ``self.fill`` set to a Series of fill values indexed
            by the column labels of ``X``.
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the fitted fills.

        Args:
            X: DataFrame to impute.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            The result of ``X.fillna(self.fill)``.
        """
        return X.fillna(self.fill)
def impute_dataframe(train, test):
    """Impute missing values in a train/test DataFrame pair.

    The imputer is fitted on ``train`` only and the same fill values are
    applied to both frames, so ``test`` is not filled with statistics
    computed from itself.

    Args:
        train: DataFrame the fill values are computed from.
        test: DataFrame imputed with the fill values learned from ``train``.

    Returns:
        A 2-tuple ``(train, test)`` of the imputed DataFrames.
    """
    imputer = DataFrameImputer().fit(train)
    return (imputer.transform(train), imputer.transform(test))