forked from cocoquiet/cse-notice-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
108 lines (79 loc) · 4.04 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import requests
from bs4 import BeautifulSoup, PageElement
import json
from notice import *
# Board URLs per category. The `sca` query parameter is the URL-encoded
# Korean category label used by the bulletin board to filter posts.
URLs = {
'전체': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1',
'일반공지': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%EC%9D%BC%EB%B0%98%EA%B3%B5%EC%A7%80',
'학사': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%ED%95%99%EC%82%AC',
'장학': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%EC%9E%A5%ED%95%99',
'심컴': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%EC%8B%AC%EC%BB%B4',
'글솝': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%EA%B8%80%EC%86%9D',
'대학원': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%EB%8C%80%ED%95%99%EC%9B%90',
'대학원 계약학과': 'https://computer.knu.ac.kr/bbs/board.php?bo_table=sub5_1&sca=%EB%8C%80%ED%95%99%EC%9B%90+%EA%B3%84%EC%95%BD%ED%95%99%EA%B3%BC'
}
# Maps the Korean category label shown on a post's detail page to the
# enum-style name expected by the downstream API.
CATEGORY_ALIAS = {
'전체' : 'ALL',
'일반공지' : 'NORMAL',
'학사' : 'STUDENT',
'장학' : 'SCHOLARSHIP',
'심컴' : 'SIM_COM',
'글솝' : 'GL_SOP',
'인컴' : 'SIM_COM',  # NOTE(review): '인컴' aliases to the same value as '심컴' — confirm this is intentional
'대학원' : 'GRADUATE_SCHOOL',
'대학원 계약학과' : 'GRADUATE_CONTRACT'
}
# Number of notice rows the board renders per page; used for pagination math.
MAX_NOTICE_SIZE = 15
class Crawler:
    """Crawler for the KNU CSE notice board.

    Scrapes the board's list pages and each post's detail page, and can
    forward the collected notices to an external API as JSON.
    """

    def __parse_notice_total_count(self) -> int:
        """Return the total number of regular (non-pinned) notices.

        Reads the row-number cell (`td.td_num2`) of the first non-pinned
        row on the '전체' (all) board, which the board renders as the
        running total.

        NOTE(review): this always queries the unfiltered board, so for a
        filtered category the count may exceed that category's post count
        — confirm against caller expectations.
        """
        response = requests.get(URLs['전체'])
        soup = BeautifulSoup(response.text, 'html.parser')
        return int(soup.select_one('tbody tr:not(.bo_notice) td.td_num2').text.strip())

    def __parse_notice_table(self, search_category, page) -> list[PageElement]:
        """Fetch one list page and return the title-anchor elements of its rows.

        Args:
            search_category: key into ``URLs`` selecting the board filter.
            page: 1-based page number appended as the ``page`` query parameter.
        """
        response = requests.get(URLs[search_category] + '&page=' + str(page))
        soup = BeautifulSoup(response.text, 'html.parser')
        return list(soup.select('tbody tr:not(.bo_notice) td.td_subject div.bo_tit a'))

    def __get_notice_data(self, notice: PageElement) -> Notice:
        """Follow a notice's link and scrape its detail page into a Notice."""
        link = notice.get('href')
        # The post id is the value of the `wr_id` query parameter in the link.
        num = int(link.split('wr_id')[-1].split('&')[0].replace('=', ''))
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.select_one('.bo_v_tit').text.strip()
        category = CATEGORY_ALIAS[soup.select_one('.bo_v_cate').text]
        # Drop non-breaking spaces the board editor inserts into post bodies.
        content = soup.select_one('#bo_v_con').text.strip().replace('\xa0', '')
        # The board shows 'YY-MM-DD HH:MM'; prepend the century and append
        # seconds to get a full timestamp string.
        created_at = '20' + soup.select_one('.if_date').text.replace('작성일 ', '') + ':00'
        return Notice(num, link, title, category, content, created_at)

    def crawl_notice_from_web(self, search_category: str='전체', amount: int=-1) -> list[Notice]:
        """Crawl notices from the board.

        Args:
            search_category (str, optional): category of notices to crawl.
                Defaults to '전체' (all).
            amount (int, optional): number of notices to crawl; ``-1`` means
                all available notices. Defaults to -1.

        Returns:
            list[Notice]: the crawled notices, in board order.
        """
        if amount == 0:
            return []
        notice_total_count = self.__parse_notice_total_count()
        if amount > notice_total_count or amount == -1:
            amount = notice_total_count
        # Ceiling division: pages actually required. (The previous formula,
        # amount // MAX_NOTICE_SIZE + 2, always requested one extra page that
        # was sliced empty whenever amount was a multiple of the page size.)
        pages = -(-amount // MAX_NOTICE_SIZE)
        notice_list: list[Notice] = []
        for page in range(1, pages + 1):
            notice_table = self.__parse_notice_table(search_category, page)
            if page == pages:
                # Trim the final page to exactly the remaining count.
                notice_table = notice_table[:amount - (pages - 1) * MAX_NOTICE_SIZE]
            for anchor in notice_table:
                notice_list.append(self.__get_notice_data(anchor))
        return notice_list

    def send_notice_to_api(self, url: str, notice_list: list[Notice]) -> 'requests.Response':
        """POST the crawled notices to an API as JSON.

        Args:
            url (str): the API endpoint URL.
            notice_list (list[Notice]): notices to send; each is serialized
                via its ``__dict__``.

        Returns:
            requests.Response: the HTTP response from the API. (The method
            was previously annotated ``-> int`` but has always returned the
            response object; the annotation is corrected here.)
        """
        response = requests.post(
            url,
            json={'data': [notice.__dict__ for notice in notice_list]},
            headers={'Content-Type': 'application/json'}
        )
        return response