-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_db.py
344 lines (271 loc) · 11.2 KB
/
generate_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
'''
Converts time-series course data stored in git commit history into an SQLite database
Usage: `pipenv run cli generate [--term Fall2020] [OPTIONS]`
More Info: `pipenv run cli generate --help`
Direct Usage [not recommended]: `pipenv run generate_db.py [OPTIONS]`
'''
import sqlite3
import json
import math
import os
from os.path import join
from subprocess import PIPE, run
from datetime import datetime, timedelta
import numpy as np
import click
from settings import LIVE_DATA_ROOT, Summer2020, Fall2020, Config
'''
Logging utilities
'''
# Print message to console prefixed with `info`
def print_info(m):
click.echo(click.style('info', fg='green') + ' ' + m)
# Print message to console prefixed with `warn`
def print_warning(m):
click.echo(click.style('warn', fg='yellow') + ' ' + m)
# Print message to console prefixed with `err`
def print_error(m):
click.echo(click.style('err ', fg='red') + ' ' + m)
'''
Date utilities
'''
# Find and return the index of the nearest date in a list of items
def nearest_date(items, pivot):
time_diff = np.abs([date - pivot for date in items])
return time_diff.argmin(0)
# Round a date
def floor_date(tm):
return tm - timedelta(minutes=tm.minute % 10,
seconds=tm.second,
microseconds=tm.microsecond)
# return tm - timedelta(minutes=tm.minute,
# # hours=tm.hour % 1,
# seconds=tm.second,
# microseconds=tm.microsecond)
'''
Git / Shell utilities
'''
# Run a shell command and return its output
def run_read(cmd):
result = run(cmd, stdout=PIPE, stderr=PIPE,
universal_newlines=True, shell=True)
return result.stdout.strip()
# Run a git command
def git(cmd):
os.system(f'cd {LIVE_DATA_ROOT} && git {cmd} --quiet')
# Run and return output of `git show` (for viewing file at commit)
# Ex. `git show {sha}:data/{term}_database.json`
def gitShow(query):
return run_read(f'cd {LIVE_DATA_ROOT} && git show {query} --quiet')
'''
Git changelog generator
'''
# TODO: Convert this to a cross-platform script (probably rewrite in python)
LOG_COMMAND = '''git log \
--date=raw \
--pretty=format:'{%n "sha": "%H",%n "author": "%aN <%aE>",%n "date": "%ad",%n "message": "%f"%n},' \
$@ | \
perl -pe 'BEGIN{print "["}; END{print "]\n"}' | \
perl -pe 's/},]/}]/'
'''
def populate_changelog(settings):
rawlog = run_read(f'cd {LIVE_DATA_ROOT}/.. && ' + LOG_COMMAND)
snapshots = list(
filter(lambda c: c['message'] == 'Update-data', json.loads(rawlog)))
# TODO: make the following loops more efficient
if settings.end_sha:
for i, s in enumerate(snapshots):
if s['sha'] == settings.end_sha:
snapshots = snapshots[i:]
break
if settings.start_sha:
for i, s in enumerate(snapshots):
if s['sha'] == settings.start_sha:
snapshots = snapshots[:i]
break
return snapshots
'''
Converts cool stuff into another format
'''
class GitHistoryConverter:
# External config
settings = None
config = None
# Hooks
setup_progress = None
update_progress = None
dates = []
shas = []
abort = False
cur_date = None
cur_term = None
term_index = None
def __init__(self, settings, config, setup_progress=None, update_progress=None):
self.settings = settings
self.config = config
self.setup_progress = setup_progress
self.update_progress = update_progress
def convert(self):
print_info('Starting up data converter...')
print_info('Use Ctrl+C to gracefully exit early')
git('checkout master')
git('pull')
commits = populate_changelog(self.settings)
for commit in commits:
timestamp = int(commit['date'].replace(' +0000', ''))
self.dates.append(datetime.fromtimestamp(timestamp))
self.shas.append(commit['sha'])
for i, (name, term) in enumerate(self.settings.term_codes.items()):
# Currently, abort acts like "skip ahead"
self.abort = False
self.cur_date = None
self.cur_term = name.upper()
self.term_index = i
self.parse_term(term)
print_info('Data converter has finished!')
def setup_db(self, term: str):
conn = sqlite3.connect(f'db/temp_{term}.sqlite3')
c = conn.cursor()
alreadyExists = False
if not self.config.full_reset:
tableExists = c.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="classes"').fetchone()
if tableExists and tableExists[0]:
alreadyExists = True
if not alreadyExists:
c.execute('DROP TABLE IF EXISTS classes')
c.execute('''CREATE TABLE classes (
time TIMESTAMP,
CRN INT,
status TEXT,
seats INT,
wait_seats INT,
wait_cap INT
)''')
c.execute('CREATE UNIQUE INDEX time_crn ON classes (time, CRN)')
if self.setup_meta_table(c):
meta = self.read_meta(c)
else:
meta = None
return (conn, c, alreadyExists, meta)
def setup_meta_table(self, c: sqlite3.Cursor):
alreadyExists = False
tableExists = c.execute('SELECT name FROM sqlite_master WHERE type="table" AND name="meta"').fetchone()
if tableExists and tableExists[0]:
alreadyExists = True
if not alreadyExists:
c.execute('DROP TABLE IF EXISTS meta')
c.execute('''CREATE TABLE meta (
key TEXT UNIQUE,
value TEXT
)''')
return alreadyExists
def write_meta(self, c: sqlite3.Cursor):
c.execute('INSERT OR REPLACE INTO meta VALUES(?, ?)', ['interval', self.config.interval_time])
def read_meta(self, c: sqlite3.Cursor):
rows = c.execute('SELECT * FROM meta').fetchall()
metadata = {row[0]:row[1] for row in rows}
return metadata
def parse_term(self, term):
(conn, c, tableExists, meta) = self.setup_db(term)
start_date = None
if not self.config.full_reset and tableExists:
latest_item = c.execute('SELECT * FROM classes ORDER BY time DESC LIMIT 1').fetchone()
if latest_item:
start_date = datetime.fromisoformat(latest_item[0])
print_info(f'Skipping full reset')
click.echo(f' {click.style("Start Date:", dim=True)} {start_date}')
if meta and (int(meta['interval']) != self.config.interval_time):
print_warning(f'Ignoring specified interval time ({self.config.interval_time} min)')
print_warning(f'Using the existing time instead ({meta["interval"]} min)')
self.config.interval_time = int(meta['interval'])
cmds = []
try:
self.loop(term, start_date, cmds)
except KeyboardInterrupt:
print(f'\r{self.fterm()} Exited early at', self.cur_date, ' ', end='\r\n')
self.abort = True
finally:
MSG_WRITING = 'Inserting rows into DB...'
MSG_WROTE = 'Finished writing to DB '
print(f'{self.fterm()} ' + MSG_WRITING, end='\r')
# try:
c.executemany('INSERT INTO classes VALUES(?, ?, ?, ?, ?, ?) ON CONFLICT DO NOTHING', cmds)
# except sqlite3.IntegrityError:
# click.echo('uh oh', err=True)
# pass
self.write_meta(c)
conn.commit()
conn.close()
print(f'{self.fterm()} {MSG_WROTE}')
def loop(self, term, start, cmds):
dlatest = floor_date(self.dates[0])
dfirst = start or floor_date(self.dates[-1])
interval = self.config.interval_time
delta = (dlatest - dfirst).total_seconds() / 60
iter_count = math.ceil(delta / interval)
click.echo(f'{self.fterm()} Analyzing term {click.style(term, bold=True)}')
click.echo(f' {click.style("Start:", dim=True)} {dfirst}')
click.echo(f' {click.style("End: ", dim=True)} {dlatest}')
click.echo(f' {click.style("Data: ", dim=True)} {iter_count} chunks {click.style("of", dim=True)} {interval} min')
self.cur_date = dfirst
if self.setup_progress:
self.setup_progress(self.cur_term)
with click.progressbar(
range(iter_count),
fill_char='█',
bar_template=" [%(bar)s] %(info)s"
) as bar:
for i in bar:
if self.abort:
break
nearest_date_index = nearest_date(self.dates, self.cur_date)
# Disable progress bar and enable the following for advanced debugging
# print(f'{self.fterm()} Analyzing Commit: i =',
# nearest_date_index, self.shas[nearest_date_index][0:6], end='\r')
self.git_magic(term, cmds, self.shas[nearest_date_index], self.cur_date)
self.cur_date += timedelta(minutes=interval)
if self.update_progress:
self.update_progress((i + 1) / iter_count)
def git_magic(self, term, cmds, sha, date):
db = json.loads(gitShow(f'{sha}:data/{term}_database.json'))
for dept in db:
table = db[dept]['1']
for courses in table.values():
for ccc in courses.values():
cl = ccc[0]
data = [
date,
cl['CRN'],
cl['status'],
cl['seats'],
cl['wait_seats'],
cl['wait_cap'],
]
cmds.append(data)
def fterm(self):
name = self.cur_term.ljust(4)
color = self.settings.term_colors[self.term_index]
return click.style(name, fg=color)
TERM_NAMES_TO_CONFIG = {
'summer2020': Summer2020,
'fall2020': Fall2020,
}
DEFAULT_TERM = 'fall2020'
def setup_cmd(ctx, name=None):
choices = click.Choice(choices=TERM_NAMES_TO_CONFIG.keys(), case_sensitive=False)
@ctx.command(name)
@click.option('--term', '-t', type=choices, default=DEFAULT_TERM,
metavar='<quarter><year>', help='The term to generate data for, such as "Fall2020"')
@click.option('--interval-time', '-i', type=click.IntRange(5, 60), default=60,
metavar='<minutes>', help='The interval to generate time-series data')
@click.option('--skip-reset', is_flag=True, default=False,
help='Only add new data to existing DBs')
def convert(term: str, interval_time: int, skip_reset: bool):
"""Convert Git history repos into Sqlite3 database."""
config = Config(interval_time, not skip_reset)
converter = GitHistoryConverter(TERM_NAMES_TO_CONFIG[term.lower()], config)
converter.convert()
return convert
if __name__ == '__main__':
cmd = setup_cmd(click)
cmd() # pylint: disable=no-value-for-parameter