Skip to content

Commit

Permalink
Fix context counters (#51)
Browse files Browse the repository at this point in the history
* Update the context counter when a context times out

* Synchronisation of ContextCounters

* Fixed a bug where the counter became negative (logged as `Changing contextCounter from -4 to 0 due to synchronization`) because contexts had fallen off.

* Structure changes

* Logging error and version++
  • Loading branch information
MatthewZMSU authored Aug 22, 2024
1 parent 0d891f9 commit 581e3be
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 27 deletions.
12 changes: 7 additions & 5 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,6 @@ const STEALTH_BROWSING = (process.env.STEALTH_BROWSING || "true").toLowerCase()
const MAX_CONCURRENT_CONTEXTS = process.env.MAX_CONCURRENT_CONTEXTS === "Infinity" ? Infinity : parseInt(process.env.MAX_CONCURRENT_CONTEXTS);
const CONTEXT_TIMEOUT = parseInt(process.env.CONTEXT_TIMEOUT) || 600000; // 10 minutes

timeoutContext.initTimeoutContext(CONTEXT_TIMEOUT);
limitContext.initContextCounter(MAX_CONCURRENT_CONTEXTS);
loggers.initLogger(LOG_LEVEL, LOG_FILE, LOGSTASH_HOST, LOGSTASH_PORT);

async function setupBrowser() {
try {
if (TOKEN_2CAPTCHA) { // If token is given then RecaptchaPlugin is activated
Expand Down Expand Up @@ -88,14 +84,20 @@ async function setupBrowser() {
process.exit(1);
}

createPuppeteerMetrics(app);
createPuppeteerMetrics(app); // TODO: to check if we can move it to services initialization part
}

// App initialization
(async () => {
await setupBrowser();
app.set('lock', new AsyncLock());
})();

// Services initialization
timeoutContext.initTimeoutContext(CONTEXT_TIMEOUT);
limitContext.initContextCounter(app, MAX_CONCURRENT_CONTEXTS);
loggers.initLogger(LOG_LEVEL, LOG_FILE, LOGSTASH_HOST, LOGSTASH_PORT);

app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(middlewares.logHTTPMiddleware());
Expand Down
15 changes: 13 additions & 2 deletions helpers/limit_context.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
const { getLogger } = require('../helpers/loggers');

let contextCounter = 0;

function incContextCounter() {}
Expand All @@ -9,10 +11,19 @@ exports.decContextCounter = decContextCounter; // Empty function or decrementer
function canCreateContext() { return true; }
exports.canCreateContext = canCreateContext; // Truish function or checker if the context can be created

exports.initContextCounter = function (maxContextCounter) {
exports.initContextCounter = function (app, maxContextCounter) {
if (!isNaN(maxContextCounter)) {
exports.incContextCounter = () => { contextCounter++ };
exports.decContextCounter = () => { contextCounter-- };
exports.canCreateContext = () => { return contextCounter < maxContextCounter }
exports.canCreateContext = () => { return contextCounter < maxContextCounter };

setInterval(() => { // Synchronize number of contexts every 1 minute
const contextsNumber = app.get('browser').browserContexts().length - 1; // Minus permanent context

if (contextsNumber !== contextCounter) {
getLogger().warn(`Changing contextCounter from ${contextCounter} to ${contextsNumber} due to synchronization\n`);
contextCounter = contextsNumber;
}
}, 60000);
}
}
33 changes: 23 additions & 10 deletions helpers/timeout_context.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
const {BrowserContext} = require('puppeteer');

const loggers = require('./loggers');
const limitContext = require("./limit_context");

/**
* ContextId -> Timeout timer' IDs
* ContextId -> Timeout timer's IDs
*
* @type {{string: number}}
*/
Expand All @@ -12,25 +14,36 @@ let contextTimeout;
/**
 * Set timeout for context.
 *
 * Schedules the context to be closed after `contextTimeout` ms and stores the
 * timer id in `contextTimeoutIds` under the context id. When the timer fires,
 * the context is closed, the context counter is decremented and the stored
 * timer id is removed.
 *
 * @param {BrowserContext} context Browser context.
 **/
function setContextTimeout(context) {
    const logger = loggers.getLogger();

    // Cancel a timer that may already be registered for this context; otherwise
    // its id would be overwritten below and the stale timer would still fire.
    // clearTimeout is a no-op when nothing is stored for the id.
    clearTimeout(contextTimeoutIds[context.id]);

    contextTimeoutIds[context.id] = setTimeout(
        async () => {
            try {
                await context.close();
                limitContext.decContextCounter();
                logger.warn(`Context ${context.id} is closed due to timeout\n`);
            } catch (e) {
                // Closing failed, so the counter is not decremented on this path.
                // NOTE(review): presumably the periodic synchronisation in
                // limit_context corrects the counter afterwards - confirm.
                logger.warn(`Context ${context.id} has fallen off\n`);
                logger.error({
                    message: e,
                    contextId: context.id,
                });
            } finally {
                // The timer has fired, so its id is no longer needed either way.
                delete contextTimeoutIds[context.id];
            }
        },
        contextTimeout,
    );
}
exports.setContextTimeout = setContextTimeout;

/**
* The function clears context's timeout timer.
*
* @param {BrowserContext} context context to be cleared
* @param {BrowserContext} context Context to be cleared
*/
function clearContextTimeout(context) {
clearTimeout(contextTimeoutIds[context.id]);
Expand All @@ -41,7 +54,7 @@ exports.clearContextTimeout = clearContextTimeout;
/**
* Update timeout for context.
*
* @param {BrowserContext} context
* @param {BrowserContext} context Context.
*/
exports.updateContextTimeout = function updateContextTimeout (context) {
clearContextTimeout(context);
Expand All @@ -51,7 +64,7 @@ exports.updateContextTimeout = function updateContextTimeout (context) {
/**
* Init service that times out contexts after CONTEXT_TIMEOUT ms.
*
* @param {number} timeout
* @param {number} timeout Context timeout for the service.
*/
exports.initTimeoutContext = function initTimeoutContext (timeout) {
contextTimeout = timeout;
Expand Down
25 changes: 18 additions & 7 deletions helpers/utils.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
const exceptions = require("./exceptions");
const {Browser} = require('puppeteer');
const { proxyRequest } = require('puppeteer-proxy');
const timeoutContext = require('./timeout_context');
const limitContext = require('./limit_context');
const PuppeteerHar = require('puppeteer-har');

const exceptions = require("./exceptions");
const limitContext = require('./limit_context');
const timeoutContext = require('./timeout_context');

const PROXY_URL_KEY = 'puppeteer-service-proxy-url'

async function findContextInBrowser(browser, contextId) {
Expand All @@ -24,6 +26,12 @@ async function findPageInContext(context, pageId) {
throw new exceptions.PageNotFoundError();
}

/**
* Close contexts in browser.
*
* @param {Browser} browser Browser with contexts to close.
* @param {[string]} contextIds Context ids to close.
**/
exports.closeContexts = async function closeContexts(browser, contextIds) {
// TODO shared locks on contexts and exclusive on pages?
const closePromises = [];
Expand Down Expand Up @@ -71,6 +79,7 @@ async function wait(page, waitFor) {

/***
* This function returns `pageId` and `contextId` of corresponding page.
*
* @param page
* @returns Promise
*/
Expand All @@ -94,10 +103,11 @@ exports.getContents = async function getContents(page, waitFor) {

async function newPage(context, request) {
const page = await context.newPage();

if (request.body.harRecording){
const harWriter = new PuppeteerHar(page)
harWriter.start()
page.harWriter = harWriter
const harWriter = new PuppeteerHar(page);
await harWriter.start();
page.harWriter = harWriter;
}

await page.setRequestInterception(true);
Expand Down Expand Up @@ -125,7 +135,7 @@ async function newContext(browser, options = {}) {
const context = await browser.createIncognitoBrowserContext(options);
limitContext.incContextCounter();
timeoutContext.setContextTimeout(context);
return context
return context;
} catch (err) {
limitContext.decContextCounter();
throw err;
Expand All @@ -141,6 +151,7 @@ function getProxy(request) {
/***
* This function returns a page from the browser context, or creates a new page (or even a new context) if pageId or
* contextId is not provided. If no context or no page is found, it throws an error.
*
* @param browser
* @param request
* @returns {Promise<Page>}
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "scrapy-puppeteer-service",
"version": "0.3.4",
"version": "0.3.5",
"private": true,
"scripts": {
"start": "node ./bin/www"
Expand All @@ -25,7 +25,7 @@
"puppeteer-extra-plugin-recaptcha": "^3.6.8",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-proxy": "^2.1.2",
"puppeteer-har": "1.1.2",
"puppeteer-har": "^1.1.2",
"winston": "^3.11.0",
"winston-logstash": "^1.2.1"
}
Expand Down
1 change: 1 addition & 0 deletions routes/health_check.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const express = require('express');

const router = express.Router();

/**
Expand Down
36 changes: 35 additions & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,16 @@ chownr@^1.1.1:
resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz"
integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==

chrome-har@^0.11.3:
version "0.11.12"
resolved "https://registry.yarnpkg.com/chrome-har/-/chrome-har-0.11.12.tgz#29a75a0d9ebb70c9c40d8fbd35c3db7d4f010e25"
integrity sha512-Fi/YCoUHjQMQC0sPKCdiuGVbApeEwIUNvISrlwZgbuUcxfHJA6MjD4RsIH/YSOAo/Z3ENiF+xaEpsdqqdETIjg==
dependencies:
dayjs "1.8.31"
debug "4.1.1"
tough-cookie "4.0.0"
uuid "8.0.0"

[email protected]:
version "0.4.7"
resolved "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-0.4.7.tgz"
Expand Down Expand Up @@ -594,6 +604,11 @@ cross-spawn@^6.0.5:
shebang-command "^1.2.0"
which "^1.2.9"

[email protected]:
version "1.8.31"
resolved "https://registry.yarnpkg.com/dayjs/-/dayjs-1.8.31.tgz#0cd1114c2539dd5ad9428be0c38df6d4bb40b9d3"
integrity sha512-mPh1mslned+5PuIuiUfbw4CikHk6AEAf2Baxih+wP5fssv+wmlVhvgZ7mq+BhLt7Sr/Hc8leWDiwe6YnrpNt3g==

[email protected]:
version "2.6.9"
resolved "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz"
Expand All @@ -608,6 +623,13 @@ debug@4, [email protected], debug@^4.1.1, debug@~4.3.4:
dependencies:
ms "2.1.2"

[email protected]:
version "4.1.1"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.1.1.tgz#3b72260255109c6b589cee050f1d516139664791"
integrity sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==
dependencies:
ms "^2.1.1"

decompress-response@^6.0.0:
version "6.0.0"
resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz"
Expand Down Expand Up @@ -1843,6 +1865,13 @@ puppeteer-extra@^3.3.6:
debug "^4.1.1"
deepmerge "^4.2.2"

puppeteer-har@^1.1.2:
version "1.1.2"
resolved "https://registry.yarnpkg.com/puppeteer-har/-/puppeteer-har-1.1.2.tgz#f78e832118ee083ab86bf3e6b73c6642d9e5325f"
integrity sha512-Z5zfoj8RkhUT9UbrrR8JjOHNnCr7sNINoeR346F40sLo/4zn+KX/sw/eoKNrtsISc1s/2YCZaqaSEVx6cZ8NQg==
dependencies:
chrome-har "^0.11.3"

puppeteer-proxy@^2.1.2:
version "2.1.2"
resolved "https://registry.npmjs.org/puppeteer-proxy/-/puppeteer-proxy-2.1.2.tgz"
Expand Down Expand Up @@ -2249,7 +2278,7 @@ [email protected]:
resolved "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz"
integrity sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==

tough-cookie@^4.0.0:
tough-cookie@4.0.0, tough-cookie@^4.0.0:
version "4.0.0"
resolved "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz"
integrity sha512-tHdtEpQCMrc1YLrMaqXXcj6AxhYi/xgit6mZu1+EDWUn+qhUf8wMQoFIy9NXuq23zAwtcB0t/MjACGR18pcRbg==
Expand Down Expand Up @@ -2337,6 +2366,11 @@ [email protected]:
resolved "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz"
integrity sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM=

[email protected]:
version "8.0.0"
resolved "https://registry.yarnpkg.com/uuid/-/uuid-8.0.0.tgz#bc6ccf91b5ff0ac07bbcdbf1c7c4e150db4dbb6c"
integrity sha512-jOXGuXZAWdsTH7eZLtyXMqUb9EcWMGZNbL9YcGBJl4MH4nrxHmZJhEHvyLFrkxo+28uLb/NYRcStH48fnD0Vzw==

vali-date@^1.0.0:
version "1.0.0"
resolved "https://registry.npmjs.org/vali-date/-/vali-date-1.0.0.tgz"
Expand Down

0 comments on commit 581e3be

Please sign in to comment.