crawl_utils.py
""" A collection of utilities for crawl scripts """
from StringIO import StringIO
import requests
import zipfile
import random
import shutil
import json
import glob
import os
EC2_LIST = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

def get_top_1m(location):
    """
    Returns a list of the top 1 million sites. If no cached copy
    exists at `location`, a fresh list is downloaded.
    @param location directory in which the raw list is cached
    """
    location = os.path.expanduser(location)
    site_list = os.path.join(location, 'top-1m.csv')
    if not os.path.isfile(site_list):
        print "%s does not exist, downloading a copy." % site_list
        resp = requests.get(EC2_LIST)
        with zipfile.ZipFile(StringIO(resp.content), 'r') as zpf:
            contents = zpf.read(zpf.infolist()[0])
        if not os.path.isdir(location):
            os.makedirs(location)
        with open(site_list, 'w') as f:
            f.write(contents)
    else:
        with open(site_list, 'r') as f:
            contents = f.read()
    # Each line is `rank,site`; strip the trailing newline so the last
    # (empty) line doesn't yield an empty site entry.
    return [x.split(',')[-1] for x in contents.strip().split('\n')]
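
# A hedged usage sketch for get_top_1m, shown as a doctest-style
# comment so importing this module stays side-effect free. The cache
# directory '~/crawl_data' is an assumed example path:
#
#     >>> sites = get_top_1m('~/crawl_data')
#     >>> len(sites)        # one entry per row of the Alexa CSV
#     1000000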

def get_sampled_sites(location, include_rank=False,
                      slices=[(10000, 0, 10000),
                              (10000, 10000, 100000),
                              (15000, 100000, 1000000)]):
    """
    Returns a sampled list of sites, reading a cached sample from
    `location` when one exists and otherwise drawing (and caching) a
    fresh sample via `sample_top_sites`. Parameters are as described
    in `sample_top_sites`.
    """
    location = os.path.expanduser(location)
    site_list = os.path.join(location, 'sampled_sites.json')
    # If a sampled site list already exists, read and return it
    if os.path.isfile(site_list):
        with open(site_list, 'r') as f:
            return json.load(f)
    # If not, create it, cache it, and return it
    if not os.path.isdir(location):
        os.makedirs(location)
    sites = sample_top_sites(location, include_rank, slices)
    with open(site_list, 'w') as f:
        json.dump(sites, f)
    return sites

def sample_top_sites(location, include_rank=False,
                     slices=[(10000, 0, 10000),
                             (10000, 10000, 100000),
                             (15000, 100000, 1000000)]):
    """
    Returns a subsample of sites from the top 1 million, drawn
    according to `slices`.

    Parameters
    ----------
    location : str
        Location of the top 1 million site list. If the list does not
        exist at this location it will be downloaded.
    include_rank : bool
        Indicates whether or not to include the Alexa rank in the
        output sample.
    slices : list of tuples
        List of slices to sample. Each slice should be given as:
        (# of sites, start_index, end_index)

    Returns
    -------
    list of str or list of tuples
        List of URLs sampled from the top 1m according to `slices`. If
        `include_rank` is True, this returns a list of
        `(int: rank, str: url)` tuples.
    """
    location = os.path.expanduser(location)
    top_1m = get_top_1m(location)
    if include_rank:
        # Alexa ranks are 1-indexed
        top_1m = zip(range(1, len(top_1m) + 1), top_1m)
    sites = list()
    for num_sites, start, end in slices:
        sites.extend(random.sample(top_1m[start:end], num_sites))
    return sites
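
# A sketch of how `slices` shapes a sample; the values below are
# illustrative assumptions, not the defaults above. This would draw 5
# sites uniformly from the top 100 and 5 more from sites ranked
# 101-1000, tagging each with its 1-indexed Alexa rank:
#
#     >>> sample_top_sites('~/crawl_data', include_rank=True,
#     ...                  slices=[(5, 0, 100), (5, 100, 1000)])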

def clear_tmp_folder():
    """
    Clear /tmp of directories / files that may have been missed
    during cleanup.
    """
    # Temporary directories created via tempfile (default prefix 'tmp')
    tmpfiles = glob.glob('/tmp/tmp*')
    for tmpfile in tmpfiles:
        try:
            shutil.rmtree(tmpfile)
        except OSError:
            pass
    # Stale X display lock files
    tmpfiles = glob.glob('/tmp/.X*-lock')
    for tmpfile in tmpfiles:
        try:
            os.remove(tmpfile)
        except OSError:
            pass
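
# A minimal end-to-end sketch, runnable as `python crawl_utils.py`.
# The cache directory '~/crawl_data' and the small slices are assumed
# example values, not ones the module prescribes. `clear_tmp_folder`
# is deliberately not called here, since it deletes files under /tmp
# system-wide.
if __name__ == '__main__':
    # Sample 5 sites from the top 100 and 5 from ranks 101-1000; the
    # result is cached as sampled_sites.json and reused on later runs.
    sites = get_sampled_sites('~/crawl_data',
                              slices=[(5, 0, 100), (5, 100, 1000)])
    for site in sites:
        print site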