forked from acmpesuecc/CVpedia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
111 lines (88 loc) · 3.02 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
from flask import Flask, render_template, url_for
app = Flask(__name__)

# Topic page on Times of India that lists the coronavirus-in-India coverage.
url = 'https://timesofindia.indiatimes.com/topic/coronavirus-india'

# The website only displays pages up to 20, so the listing URLs for pages
# 1..20 are generated up front instead of being discovered from the site.
# (An earlier version fetched the topic page here, but the parsed result was
# never read, so that extra blocking request has been dropped.)
# NOTE(review): the page number is appended directly to the topic slug with
# no separator ("...coronavirus-india1") - confirm this matches the site's
# pagination scheme.
max_urls = [url + str(page_no) for page_no in range(1, 21)]

# Result columns, filled in by the scraping steps that follow.
headlines, dates, news, urls = [], [], [], []
print("[INFO] Extracting links...")
# Walk the listing pages and collect, for every article, its headline, its
# publication date and its absolute URL.
pages_scraped = 0
for page_url in max_urls:
    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        soup = BeautifulSoup(get(page_url, timeout=30).text, 'lxml')
    except Exception as exc:
        # Best effort: a listing page that cannot be fetched or parsed ends
        # pagination, but everything collected so far is kept.
        print("[WARN] Stopping at", page_url, "-", exc)
        break
    pages_scraped += 1

    # Each selector is evaluated once per page. zip() stops at the shortest
    # result, so exactly one entry per article is appended to each of the
    # three lists and they always stay the same length.
    listing = zip(soup.select('span.title'),
                  soup.select('span.meta'),
                  soup.select('.content'))
    for title_tag, meta_tag, content_tag in listing:
        headlines.append(title_tag.text.strip())

        # Keep only the date part (text before the first space) of the
        # parsed timestamp; an unparseable date is recorded as None.
        try:
            pub_date = str(parser.parse(meta_tag.text)).split()[0]
        except (ValueError, OverflowError):
            pub_date = None
        dates.append(pub_date)

        # The link is the href of the first <a> inside the .content element;
        # a missing <a> or href is recorded as None.
        try:
            source = 'https://timesofindia.indiatimes.com' + content_tag.a['href']
        except (TypeError, KeyError):
            source = None
        urls.append(source)

print("[INFO] Links Extracted.")
print("The total no. of pages is=", pages_scraped)
print("No. articles=", len(dates))
# Dates are YYYY-MM-DD strings, so the lexicographic minimum is the earliest
# one. Placeholders are skipped and an empty result prints None.
known_dates = [d for d in dates if d is not None]
print("Last article goes back till: ", min(known_dates, default=None))
print("[INFO] Extracting articles...")
c = 0  # number of articles whose body text was found
# Built once so the per-character membership test below is O(1).
printable_chars = set(string.printable)
for index in tqdm(urls):
    # Exactly one entry is appended to `news` per URL (None on any failure)
    # so that it stays aligned with the other columns.
    news_article = None
    try:
        # Download the article page and parse it with BeautifulSoup.
        # A timeout keeps one unresponsive article from hanging the run.
        soup = BeautifulSoup(get(index, timeout=30).text, 'lxml')
        body = soup.select_one('._3WlLe')
        if body is not None:
            # Collapse all whitespace runs to single spaces, then drop every
            # character that is not in string.printable.
            collapsed = ' '.join(body.text.split())
            news_article = ''.join(
                ch for ch in collapsed if ch in printable_chars)
    except Exception:
        # Best effort: a page that cannot be fetched or parsed yields None.
        # Catching Exception (not a bare except) lets Ctrl-C stop the run.
        news_article = None
    if news_article is not None:
        c += 1
    news.append(news_article)
# Reported once here; printing inside the loop garbles the tqdm progress bar.
print("Articles with text=", c)
print("[INFO] Articles Extracted.")
# Assemble the scraped columns into a single table, one row per article.
columns = {
    'Headlines': headlines,
    'Article': news,
    'Published_Dates': dates,
    'Source_URLs': urls,
}
df = pd.DataFrame(columns)
print(df.head(5))

# Only the first ten rows are exposed to the templates; the module-level
# `headlines` and `dates` names are rebound to those shortened lists.
top_rows = df.head(10)
headlines = list(top_rows['Headlines'])
sources = list(top_rows['Source_URLs'])
dates = list(top_rows['Published_Dates'])
print(headlines)
@app.route('/news.html')
def news():
    """Render ``news.html`` with the module-level headlines, sources and dates."""
    context = {
        'headlines': headlines,
        'sources': sources,
        'dates': dates,
    }
    return render_template('news.html', **context)
@app.route('/')
@app.route('/index.html')
def index():
    """Render ``index.html``; served at both ``/`` and ``/index.html``."""
    template_name = 'index.html'
    return render_template(template_name)
@app.route('/contact.html')
def contact():
    """Render ``contact.html`` for the ``/contact.html`` route."""
    template_name = 'contact.html'
    return render_template(template_name)
if __name__ == '__main__':
    # debug=True also turns on the auto-reloader, which re-executes this
    # script in a child process; because the scraping happens at module
    # level, it would run twice on every start. The reloader is therefore
    # disabled while the interactive debugger stays on.
    # NOTE(review): debug mode is for local development only - do not expose
    # a server started this way to untrusted networks.
    app.run(debug=True, use_reloader=False)