-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_data.py
178 lines (130 loc) · 5.24 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python
# https://stackoverflow.com/a/2429517/
# Load Primary and Secondary Attribute Data
# These data contain monadic (individual-level) attribute data for each
# villager in each of the 16 villages wherein network data were collected.
# The variables/features used for the analysis are as follows:
# female: 1 == Female; 0 == Male
# hh: The ID of each villager's household
# Primary Attribute Data
survey_responses_AJPS = pd.read_csv(
filepath_or_buffer = "nodes.csv",
header = 0,
keep_default_na = True,
# nrows = 10
)
survey_responses_AJPS = survey_responses_AJPS.loc[:, [ "villageId", "i", "female" ]]
survey_responses_AJPS = survey_responses_AJPS.rename(
columns = {"i": "ID", "villageId": "village_ID"}
)
# Cast data in the columns as the appropriate type and reassign.
# Note, there are missing data for some of the features/variables.
# Thus, dtype == "float64" will accommodate "NaN", Numpy style.
# However, dtype == "Int64" (note capital "I") will support NaN integer in Pandas.
# https://wesmckinney.com/book/data-cleaning.html#pandas-ext-types
survey_responses_AJPS = survey_responses_AJPS.astype(
dtype = {
"ID": "string",
"village_ID": "string",
"female": "Int64"
}
)
survey_responses_AJPS = survey_responses_AJPS.set_index("ID", drop = True)
# Secondary Attribute Data
survey_responses_CPS = pd.read_csv(
filepath_or_buffer = "nodes_CPS_Version_1.2.csv",
header = 0,
keep_default_na = True,
# nrows = 10
)
survey_responses_CPS = survey_responses_CPS.loc[:, [ "villageId", "i","villageTxt", "hh"]]
survey_responses_CPS = survey_responses_CPS.rename(
columns = {
"i": "ID",
"villageId": "village_ID",
"villageTxt": "village_ID_txt",
"hh": "HH_ID"
}
)
survey_responses_CPS = survey_responses_CPS.astype(
dtype = {
"ID": "string",
"village_ID": "string",
"village_ID_txt": "string",
"HH_ID": "string"
}
)
survey_responses_CPS = survey_responses_CPS.set_index("ID", drop = True)
# Merge/join (One-to-One) Primary Attribute Data and Secondary Attribute Data
# Merge/join operations combine datasets by linking rows using one or more keys.
# Inner joins produce the intersection of tables based on common keys (here, ID).
# https://wesmckinney.com/book/data-wrangling.html
survey_responses = pd.merge(
how = "inner",
left = survey_responses_AJPS,
right = survey_responses_CPS,
# left_on = "ID",
# right_on = "ID"
left_index = True,
right_index = True,
suffixes = ("_AJPS", "_CPS"),
indicator = False
)
del survey_responses_AJPS, survey_responses_CPS
# Duplicate columns appear in the merged dataframe when columns in
# the left/right dataframe have identical names.
del survey_responses["village_ID_CPS"]
survey_responses = survey_responses.rename(columns = {"village_ID_AJPS": "village_ID"})
# Load social network data
# These network data are arranged in a "long" or "edge list" format
# (i.e., each row of the data frame is an ordered dyad) and they
# detail information on one of six relationships between residents
# living in the same village plus within-village, pairwise geographic distance.
# Thus, villager i/i_ID is the sending/nominating actor and villager j/j_ID is
# the receiving/nominated actor. The relationship types are as follows:
# family: i named j as a member of their family.
# lender: i named j as someone who would lend them money.
# friend: i named j as a friend.
# solver: i named j as someone she would go to to solve a problem.
# Contgame: i voted for j to receive the village's money in a public goods game.
# speak: i spoke to j about UBridge (i.e., a government reporting platform)
# geo: geographic distance between i and j in meters (NA for all i, j not in same village).
# The only relationships analysed for my study are: lender, family, and friend.
# Note, rows for the different relationships are stacked such that
# an ordered dyad can appear multiple times if i/i_ID nominates j/j_ID
# in response to more than one name generator + the dyad for geographic distance.
# To see this, RUN: nominations.loc[nominations.index[nominations.index.duplicated()]]
nominations = pd.read_csv(
filepath_or_buffer = "ties.csv",
header = 0,
keep_default_na = True,
# nrows = 10
)
nominations = nominations.rename(
columns = {
"villageId": "village_ID",
"i": "i_ID",
"j": "j_ID"
}
)
nominations = nominations[nominations["type"].isin(["lender", "family", "friend"])]
del nominations["dist"]
nominations = nominations.astype(
dtype = {
"village_ID": "string",
"i_ID": "string",
"j_ID": "string",
"type": "string"
}
)
# Create new column for the unique id of each directed relationship
nominations["ij_ID"] = nominations['i_ID'] + "_" + nominations['j_ID']
nominations = nominations.set_index("ij_ID", drop = True)
# Save the types of relationships use for this analysis.
relationship_type = pd.unique(nominations["type"])
# Set aside list of the IDs of each village in the study for use later.
village_IDs = pd.unique(survey_responses["village_ID"])
# Basic information on the monadic data
print(survey_responses, "\n")
print(survey_responses.info())
# survey_responses.describe(),