main.py
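"""Scrape the top GitHub repositories per language and measure function
lengths with lizard.

Usage:
    python main.py --scrape             # fetch repo metadata into data.json
    python main.py --analyze            # clone each repo and write per-repo CSVs
    python main.py --scrape --analyze   # do both in one run

Expects a keys.json file containing "client_id" and "client_secret"
GitHub API credentials.
"""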
import argparse
import json
import logging
import os
import shutil
import stat

import lizard
from git import Repo

from scraper import PythonScraper

def del_rw(action, name, exc):
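    """shutil.rmtree onerror handler: clear the read-only flag and retry
    the delete (needed for Git's read-only object files on Windows)."""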
os.chmod(name, stat.S_IWRITE)
os.remove(name)

def parsearg():
    """Parse the --scrape / --analyze command-line flags."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--scrape", action="store_true",
                        help="scrape the top repositories for each language")
    parser.add_argument("--analyze", action="store_true",
                        help="clone the scraped repositories and analyze them")
    return parser.parse_args()

def printfilescsv(language, repo, lfiles):
    """Write a CSV of (file, function, length) rows for one analyzed repo."""
    repo = repo.replace('/', '_')
    path = "./data/{}/{}.csv".format(language, repo)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as csv_file:
        csv_file.write("File,Function,Length\n")
        for lfile in lfiles:
            for func in lfile.function_list:
                csv_file.write("{},\"{}\",{}\n".format(
                    lfile.filename,
                    func.long_name.replace("\"", "\'"),
                    func.length))

def main():
    args = parsearg()

    # "Constants" that we need
    KEY_FILE = "keys.json"
    DATA_FILE = "data.json"
    REPOS_DIR = "repos/"
    NUM_TO_SCRAPE = 1000
    LANGUAGES = ["python"]

    # Configure the logging
    logging.basicConfig(
        filename="log.txt",
        level=logging.INFO,
        format='L %(asctime)s %(message)s',
        datefmt='%Y-%m-%d-%H-%M-%S'
    )

    # If we want to scrape
    if args.scrape:
        with open(KEY_FILE, 'r') as key_file:
            KEYS = json.load(key_file)
        if KEYS["client_id"] == "" or KEYS["client_secret"] == "":
            print("Please add your GitHub Client ID and Client Secret to keys.json")
            return
        scraper = PythonScraper(KEYS)
        for language in LANGUAGES:
            scraper.getTopRepos(language, NUM_TO_SCRAPE)
        repos = scraper.getRepos()
        for language in repos:
            for repo in repos[language]:
                logging.info("{} {}".format(
                    language,
                    repos[language][repo]["html_url"]))
        # Save what we scraped so --analyze can run later on its own
        data = json.dumps(repos, indent=4)
        with open(DATA_FILE, 'w') as data_file:
            data_file.write(data)

    # If we want to analyze
    if args.analyze:
        # If we are only analyzing, load the repo list from a previous scrape
        if not args.scrape:
            with open(DATA_FILE) as data_file:
                repos = json.load(data_file)
        extensions = [".py"]
        for language in repos:
            for repo in repos[language]:
                lfiles = []
                repo_name = repo.split('/')[1]
                repo_dest = REPOS_DIR + repo_name
                logging.info("Cloning {}".format(repo))
                r = Repo.clone_from(repos[language][repo]["html_url"], repo_dest)
                print("Analyzing {}".format(repo_dest))
                for (root, subdir, files) in os.walk(repo_dest):
                    for file in files:
                        fullpath = os.path.join(root, file)
                        # With no extension filter, analyze every file
                        if extensions:
                            for extension in extensions:
                                if fullpath.endswith(extension):
                                    lfile = lizard.analyze_file(fullpath)
                                    lfiles.append(lfile)
                        else:
                            lfile = lizard.analyze_file(fullpath)
                            lfiles.append(lfile)
                r.close()
                printfilescsv(language, repo, lfiles)
                logging.info("Deleting {}".format(repo))
                shutil.rmtree(repo_dest, onerror=del_rw)

if __name__ == "__main__":
    main()