diff --git a/app.js b/app.js index fe6e581..1679fa3 100644 --- a/app.js +++ b/app.js @@ -44,10 +44,6 @@ const STEALTH_BROWSING = (process.env.STEALTH_BROWSING || "true").toLowerCase() const MAX_CONCURRENT_CONTEXTS = process.env.MAX_CONCURRENT_CONTEXTS === "Infinity" ? Infinity : parseInt(process.env.MAX_CONCURRENT_CONTEXTS); const CONTEXT_TIMEOUT = parseInt(process.env.CONTEXT_TIMEOUT) || 600000; // 10 minutes -timeoutContext.initTimeoutContext(CONTEXT_TIMEOUT); -limitContext.initContextCounter(MAX_CONCURRENT_CONTEXTS); -loggers.initLogger(LOG_LEVEL, LOG_FILE, LOGSTASH_HOST, LOGSTASH_PORT); - async function setupBrowser() { try { if (TOKEN_2CAPTCHA) { // If token is given then RecaptchaPlugin is activated @@ -88,14 +84,20 @@ async function setupBrowser() { process.exit(1); } - createPuppeteerMetrics(app); + createPuppeteerMetrics(app); // TODO: to check if we can move it to services initialization part } +// App initialization (async () => { await setupBrowser(); app.set('lock', new AsyncLock()); })(); +// Services initialization +timeoutContext.initTimeoutContext(CONTEXT_TIMEOUT); +limitContext.initContextCounter(app, MAX_CONCURRENT_CONTEXTS); +loggers.initLogger(LOG_LEVEL, LOG_FILE, LOGSTASH_HOST, LOGSTASH_PORT); + app.use(express.json()); app.use(express.urlencoded({ extended: false })); app.use(middlewares.logHTTPMiddleware()); diff --git a/helpers/limit_context.js b/helpers/limit_context.js index 3bff9e5..fb44ce9 100644 --- a/helpers/limit_context.js +++ b/helpers/limit_context.js @@ -1,3 +1,5 @@ +const { getLogger } = require('../helpers/loggers'); + let contextCounter = 0; function incContextCounter() {} @@ -9,10 +11,19 @@ exports.decContextCounter = decContextCounter; // Empty function or decrementer function canCreateContext() { return true; } exports.canCreateContext = canCreateContext; // Truish function or checker if the context can be created -exports.initContextCounter = function (maxContextCounter) { +exports.initContextCounter = function (app, maxContextCounter) { if (!isNaN(maxContextCounter)) { exports.incContextCounter = () => { contextCounter++ }; exports.decContextCounter = () => { contextCounter-- }; - exports.canCreateContext = () => { return contextCounter < maxContextCounter } + exports.canCreateContext = () => { return contextCounter < maxContextCounter }; + + setInterval(() => { // Synchronize number of contexts every 1 minute + const contextsNumber = app.get('browser').browserContexts().length - 1; // Minus permanent context + + if (contextsNumber !== contextCounter) { + getLogger().warn(`Changing contextCounter from ${contextCounter} to ${contextsNumber} due to synchronization\n`); + contextCounter = contextsNumber; + } + }, 60000); } } diff --git a/helpers/timeout_context.js b/helpers/timeout_context.js index 8fc6e7d..6f78efa 100644 --- a/helpers/timeout_context.js +++ b/helpers/timeout_context.js @@ -1,8 +1,10 @@ const {BrowserContext} = require('puppeteer'); + const loggers = require('./loggers'); +const limitContext = require("./limit_context"); /** - * ContextId -> Timeout timer' IDs + * ContextId -> Timeout timer's IDs * * @type {{string: number}} */ @@ -12,25 +14,36 @@ let contextTimeout; /** * Set timeout for context. * - * @param {BrowserContext} context - */ + * @param {BrowserContext} context Browser context. + **/ function setContextTimeout(context) { const logger = loggers.getLogger(); contextTimeoutIds[context.id] = setTimeout( async () => { - logger.warn(`Closing context ${context.id} due to timeout\n`); - await context.close(); - delete contextTimeoutIds[context.id]; + try { + await context.close(); + limitContext.decContextCounter(); + logger.warn(`Context ${context.id} is closed due to timeout\n`); + } catch (e) { + logger.warn(`Context ${context.id} has fallen off\n`); + logger.error({ + message: e, + contextId: context.id, + }); + } finally { + delete contextTimeoutIds[context.id]; + } }, - contextTimeout); + contextTimeout, + ); } exports.setContextTimeout = setContextTimeout; /** * The function clears context's timeout timer. * - * @param {BrowserContext} context context to be cleared + * @param {BrowserContext} context Context to be cleared */ function clearContextTimeout(context) { clearTimeout(contextTimeoutIds[context.id]); @@ -41,7 +54,7 @@ exports.clearContextTimeout = clearContextTimeout; /** * Update timeout for context. * - * @param {BrowserContext} context + * @param {BrowserContext} context Context. */ exports.updateContextTimeout = function updateContextTimeout (context) { clearContextTimeout(context); @@ -51,7 +64,7 @@ exports.updateContextTimeout = function updateContextTimeout (context) { /** * Init service that timeouts contexts after CONTEXT_TIMEOUT ms. * - * @param {number} timeout + * @param {number} timeout Context timeout for the service. */ exports.initTimeoutContext = function initTimeoutContext (timeout) { contextTimeout = timeout; diff --git a/helpers/utils.js b/helpers/utils.js index 125231f..5922bd8 100644 --- a/helpers/utils.js +++ b/helpers/utils.js @@ -1,9 +1,11 @@ -const exceptions = require("./exceptions"); +const {Browser} = require('puppeteer'); const { proxyRequest } = require('puppeteer-proxy'); -const timeoutContext = require('./timeout_context'); -const limitContext = require('./limit_context'); const PuppeteerHar = require('puppeteer-har'); +const exceptions = require("./exceptions"); +const limitContext = require('./limit_context'); +const timeoutContext = require('./timeout_context'); + const PROXY_URL_KEY = 'puppeteer-service-proxy-url' async function findContextInBrowser(browser, contextId) { @@ -24,6 +26,12 @@ async function findPageInContext(context, pageId) { throw new exceptions.PageNotFoundError(); } +/** + * Close contexts in browser. + * + * @param {Browser} browser Browser with contexts to close. + * @param {[string]} contextIds Context ids to close. + **/ exports.closeContexts = async function closeContexts(browser, contextIds) { // TODO shared locks on contexts and exclusive on pages? const closePromises = []; @@ -71,6 +79,7 @@ async function wait(page, waitFor) { /*** * This function returns `pageId` and `contextId` of corresponding page. + * * @param page * @returns Promise */ @@ -94,10 +103,11 @@ exports.getContents = async function getContents(page, waitFor) { async function newPage(context, request) { const page = await context.newPage(); + if (request.body.harRecording){ - const harWriter = new PuppeteerHar(page) - harWriter.start() - page.harWriter = harWriter + const harWriter = new PuppeteerHar(page); + await harWriter.start(); + page.harWriter = harWriter; } await page.setRequestInterception(true); @@ -125,7 +135,7 @@ async function newContext(browser, options = {}) { const context = await browser.createIncognitoBrowserContext(options); limitContext.incContextCounter(); timeoutContext.setContextTimeout(context); - return context + return context; } catch (err) { limitContext.decContextCounter(); throw err; @@ -141,6 +151,7 @@ function getProxy(request) { /*** * This function returns a page from browser context or create new page or even context if pageId or contextId are * none. If no context or now page found throw an error. + * * @param browser * @param request * @returns {Promise} diff --git a/package.json b/package.json index b122783..2c706e3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "scrapy-puppeteer-service", - "version": "0.3.4", + "version": "0.3.5", "private": true, "scripts": { "start": "node ./bin/www" @@ -25,7 +25,7 @@ "puppeteer-extra-plugin-recaptcha": "^3.6.8", "puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-proxy": "^2.1.2", - "puppeteer-har": "1.1.2", + "puppeteer-har": "^1.1.2", "winston": "^3.11.0", "winston-logstash": "^1.2.1" } diff --git a/routes/health_check.js b/routes/health_check.js index f0e2c8e..40222ac 100644 --- a/routes/health_check.js +++ b/routes/health_check.js @@ -1,4 +1,5 @@ const express = require('express'); + const router = express.Router(); /** diff --git a/yarn.lock b/yarn.lock index d1da744..94d56f3 100644 --- a/yarn.lock +++ b/yarn.lock @@ -431,6 +431,16 @@ chownr@^1.1.1: resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz" integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== +chrome-har@^0.11.3: + version "0.11.12" + resolved "https://registry.yarnpkg.com/chrome-har/-/chrome-har-0.11.12.tgz#29a75a0d9ebb70c9c40d8fbd35c3db7d4f010e25" + integrity sha512-Fi/YCoUHjQMQC0sPKCdiuGVbApeEwIUNvISrlwZgbuUcxfHJA6MjD4RsIH/YSOAo/Z3ENiF+xaEpsdqqdETIjg== + dependencies: + dayjs "1.8.31" + debug "4.1.1" + tough-cookie "4.0.0" + uuid "8.0.0" + chromium-bidi@0.4.7: version "0.4.7" resolved "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.4.7.tgz" @@ -594,6 +604,11 @@ cross-spawn@^6.0.5: shebang-command "^1.2.0" which "^1.2.9" +dayjs@1.8.31: + version "1.8.31" + resolved "https://registry.yarnpkg.com/dayjs/-/dayjs-1.8.31.tgz#0cd1114c2539dd5ad9428be0c38df6d4bb40b9d3" + integrity sha512-mPh1mslned+5PuIuiUfbw4CikHk6AEAf2Baxih+wP5fssv+wmlVhvgZ7mq+BhLt7Sr/Hc8leWDiwe6YnrpNt3g== + debug@2.6.9: version "2.6.9" resolved "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz" @@ -608,6 +623,13 @@ debug@4, debug@4.3.4, debug@^4.1.1, debug@~4.3.4: dependencies: ms "2.1.2" +debug@4.1.1: + version "4.1.1" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.1.1.tgz#3b72260255109c6b589cee050f1d516139664791" + integrity sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw== + dependencies: + ms "^2.1.1" + decompress-response@^6.0.0: version "6.0.0" resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz" @@ -1843,6 +1865,13 @@ puppeteer-extra@^3.3.6: debug "^4.1.1" deepmerge "^4.2.2" +puppeteer-har@^1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/puppeteer-har/-/puppeteer-har-1.1.2.tgz#f78e832118ee083ab86bf3e6b73c6642d9e5325f" + integrity sha512-Z5zfoj8RkhUT9UbrrR8JjOHNnCr7sNINoeR346F40sLo/4zn+KX/sw/eoKNrtsISc1s/2YCZaqaSEVx6cZ8NQg== + dependencies: + chrome-har "^0.11.3" + puppeteer-proxy@^2.1.2: version "2.1.2" resolved "https://registry.npmjs.org/puppeteer-proxy/-/puppeteer-proxy-2.1.2.tgz" @@ -2249,7 +2278,7 @@ toidentifier@1.0.1: resolved "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz" integrity sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA== -tough-cookie@^4.0.0: +tough-cookie@4.0.0, tough-cookie@^4.0.0: version "4.0.0" resolved "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz" integrity sha512-tHdtEpQCMrc1YLrMaqXXcj6AxhYi/xgit6mZu1+EDWUn+qhUf8wMQoFIy9NXuq23zAwtcB0t/MjACGR18pcRbg== @@ -2337,6 +2366,11 @@ utils-merge@1.0.1: resolved "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz" integrity sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM= +uuid@8.0.0: + version "8.0.0" + resolved "https://registry.yarnpkg.com/uuid/-/uuid-8.0.0.tgz#bc6ccf91b5ff0ac07bbcdbf1c7c4e150db4dbb6c" + integrity sha512-jOXGuXZAWdsTH7eZLtyXMqUb9EcWMGZNbL9YcGBJl4MH4nrxHmZJhEHvyLFrkxo+28uLb/NYRcStH48fnD0Vzw== + vali-date@^1.0.0: version "1.0.0" resolved "https://registry.npmjs.org/vali-date/-/vali-date-1.0.0.tgz"