-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path02_02_04_pandas_limpieza_2.py
79 lines (64 loc) · 1.99 KB
/
02_02_04_pandas_limpieza_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import numpy as np
# Dataset: https://github.com/realpython/python-data-cleaning/blob/master/Datasets/BL-Flickr-Images-Book.csv
# Tutorial: https://realpython.com/python-data-cleaning-numpy-pandas/
df = pd.read_csv("csv/BL-Flickr-Images-Book.csv")
print(df)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would also emit "None".
df.info()

# Columns that are not used by any of the cleaning steps below.
to_drop = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]
df.drop(to_drop, inplace=True, axis=1)
print("Dropped")
print(df)

# Show whether 'Identifier' is free of duplicates before it becomes the index.
print(df['Identifier'].is_unique)

# `df = df.set_index('Identifier')` would return a new DataFrame;
# inplace=True modifies the existing DataFrame instead.
df.set_index('Identifier', inplace=True)
print("indexed")
print(df)

# Convert the publication date to a number: keep only a leading 4-digit year
# (entries without one become NaN) and let pandas pick the numeric dtype.
extr = df['Date of Publication'].str.extract(r'^(\d{4})', expand=False)
df['Date of Publication'] = pd.to_numeric(extr)
print("mascara")
print(df['Date of Publication'].dtype)

# Normalise the place of publication.
pub = df['Place of Publication']
# Boolean masks for rows mentioning London / Oxford. na=False keeps missing
# places out of the masks; a NaN in the mask would be treated as true by
# np.where and the row would be mislabelled as 'London'.
london = pub.str.contains('London', na=False)
oxford = pub.str.contains('Oxford', na=False)
# Collapse every London/Oxford variant to the bare city name; for all other
# places only replace hyphens with spaces.
df['Place of Publication'] = np.where(
    london, 'London',
    np.where(oxford, 'Oxford', pub.str.replace('-', ' ')))
print(df['Place of Publication'].head())
print(df)
## More cleaning: build (state, town) pairs from the university towns list.
# Lines containing '[edit]' are state headers; every other line is a town
# that belongs to the most recently seen state header.
university_towns = []
state = None  # last state header seen; None until the first one appears
# NOTE(review): assumes the text file is UTF-8 encoded — confirm. An explicit
# encoding avoids depending on the platform's default.
with open('Datasets/university_towns.txt', encoding='utf-8') as file:
    for line in file:
        if '[edit]' in line:
            # Remember this state until the next header is found.
            state = line
        elif state is None:
            # A town before any state header cannot be attributed to a state.
            raise ValueError(
                f"town line found before any '[edit]' state header: {line!r}")
        else:
            # Town line: pair it with the last-seen state. Lines are stored
            # unmodified (including their trailing newline).
            university_towns.append((state, line))

towns_df = pd.DataFrame(
    university_towns,
    columns=['State', 'RegionName'])
def get_citystate(item):
    """Return *item* cut off at its first annotation marker.

    The text before the first ' (' is returned when that marker is present;
    otherwise the text before the first '[' is returned; when neither marker
    occurs, *item* is returned unchanged.
    """
    # Markers are tried in priority order: ' (' wins over '['.
    for marker in (' (', '['):
        if marker in item:
            head, _, _ = item.partition(marker)
            return head
    return item
# Apply get_citystate element-wise to every cell of towns_df and rebind the
# name to the resulting DataFrame.
# NOTE(review): DataFrame.map is the pandas 2.1+ name for applymap — confirm
# the installed pandas version provides it.
towns_df = towns_df.map(get_citystate)