Skip to content

Commit

Permalink
增加笔趣阁书源
Browse files Browse the repository at this point in the history
  • Loading branch information
x201206030 committed May 18, 2020
1 parent 231b94f commit 5e16119
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ public class CrawlParser {

private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");

private static ThreadLocal <Integer> retryCount = new ThreadLocal<>();

@SneakyThrows
public static Book parseBook(RuleBean ruleBean, String bookId) {
Book book = new Book();
Expand Down Expand Up @@ -148,10 +150,12 @@ public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, B
//读取目录
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
String indexListHtml = getByHttpClient(indexListUrl);
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
}

if (indexListHtml != null) {
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
}

Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten());
Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml);

Expand All @@ -177,7 +181,7 @@ public static Map<Integer, List> parseBookIndexAndContent(String sourceBookId, B

//查询章节内容
String contentHtml = getByHttpClient(contentUrl);
if (contentHtml != null) {
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
//TODO插入章节目录和章节内容
Expand Down Expand Up @@ -259,17 +263,30 @@ private static String getByHttpClient(String url) {
log.debug("body长度:"+body.length());
if(body.length() < Constants.INVALID_HTML_LENGTH){
log.debug("获取html页面内容失败");
Thread.sleep( new Random().nextInt(10*1000));
return getByHttpClient(url);
return processErrorHttpResult(url);
}
//成功获得html内容
return body;
} else {
return null;
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
return processErrorHttpResult(url);

}

@SneakyThrows
private static String processErrorHttpResult(String url){
Integer count = retryCount.get();
if(count == null){
count = 0;
}
if(count < Constants.HTTP_FAIL_RETRY_COUNT){
Thread.sleep( new Random().nextInt(10*1000));
retryCount.set(++count);
return getByHttpClient(url);
}
return null;
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,9 @@ public class Constants {
* 爬取小说http请求中无效的内容长度
*/
public static final int INVALID_HTML_LENGTH = 1500;

/**
* 爬取小说http请求失败重试次数
*/
public static final Integer HTTP_FAIL_RETRY_COUNT = 5;
}
1 change: 1 addition & 0 deletions sql/20200518.sql
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"第\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"<a\\\\s+href=\\\"/s/\\\\d+\\\\.html\\\"><h2>([^/]+)</h2></a>\",\r\n \"authorNamePatten\": \"<p>作者:([^/]+)</p>\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^/]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"<div class=\\\"intro_info\\\">\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"<dd><a\\\\s+href=\\\"(\\\\d+)\\\\.html\\\">[^/]+</a></dd>\",\r\n \"indexNamePatten\": \"<dd><a\\\\s+href=\\\"\\\\d+\\\\.html\\\">([^/]+)</a></dd>\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"<div id=\\\"content\\\" class=\\\"showtxt\\\">\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34');
INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('5', '笔趣阁', '{\"bookListUrl\":\"http://m.mcmssc.com/xclass/{catId}/{page}.html\",\"catIdRule\":{\"catId1\":\"1\",\"catId2\":\"2\",\"catId3\":\"3\",\"catId4\":\"4\",\"catId5\":\"5\",\"catId6\":\"6\",\"catId7\":\"7\"},\"bookIdPatten\":\"href=\\\"/(\\\\d+_\\\\d+)/\\\"\",\"pagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"(\\\\d+)/\\\\d+\\\"\\\\s+size=\",\"totalPagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"\\\\d+/(\\\\d+)\\\"\\\\s+size=\",\"bookDetailUrl\":\"http://m.mcmssc.com/{bookId}/\",\"bookNamePatten\":\"<span\\\\s+class=\\\"title\\\">([^/]+)</span>\",\"authorNamePatten\":\"<a\\\\s+href=\\\"/author/\\\\d+/\\\">([^/]+)</a>\",\"picUrlPatten\":\"<img\\\\s+src=\\\"([^>]+)\\\"\\\\s+onerror=\",\"picUrlPrefix\":\"http://m.mcmssc.com/\",\"statusPatten\":\">状态:([^/]+)<\",\"bookStatusRule\":{\"连载\":0,\"全本\":1},\"visitCountPatten\":\">点击:(\\\\d+)<\",\"descStart\":\"<p class=\\\"review\\\">\",\"descEnd\":\"</p>\",\"bookIndexUrl\":\"http://m.mcmssc.com/{bookId}/all.html\",\"indexIdPatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\"indexNamePatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\"bookContentUrl\":\"http://www.mcmssc.com/{bookId}/{indexId}.html\",\"contentStart\":\"</p>\",\"contentEnd\":\"<div align=\\\"center\\\">\"}', '1', '2020-05-18 15:57:41', '2020-05-18 15:57:41');

0 comments on commit 5e16119

Please sign in to comment.