Skip to content

Commit

Permalink
PlaywrightBrowserManager is now ready!
Browse files Browse the repository at this point in the history
  • Loading branch information
MatthewZMSU committed Nov 18, 2024
1 parent 3b0a315 commit b113c69
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 36 deletions.
30 changes: 22 additions & 8 deletions scrapypuppeteer/browser_managers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,31 @@
from typing import Union

from scrapy import Request
from scrapy.utils.defer import deferred_from_coro
from twisted.internet.defer import Deferred


class BrowserManager(ABC):
@abstractmethod
def download_request(self, request: Request, spider) -> Union[Coroutine, Request]:
def _download_request(self, request: Request, spider) -> Union[Coroutine, Request]:
...

# @abstractmethod
# def close_used_contexts(self):
# ...
#
# @abstractmethod
# def process_response(self, middleware, request, response, spider):
# ...
@abstractmethod
async def _start_browser_manager(self) -> None:
    """Subclass hook: asynchronously acquire browser resources.

    Called (wrapped in a Deferred) by ``start_browser_manager`` when the
    crawler engine starts; implementations perform their async set-up here.
    """
    ...

@abstractmethod
async def _stop_browser_manager(self) -> None:
    """Subclass hook: asynchronously release browser resources.

    Called (wrapped in a Deferred) by ``stop_browser_manager`` when the
    crawler engine stops; implementations perform their async tear-down here.
    """
    ...

def download_request(self, request: Request, spider) -> Union[Deferred, Request]:
    """Dispatch *request* to the subclass hook and adapt its result.

    A coroutine result is wrapped into a Twisted ``Deferred`` via
    ``deferred_from_coro``; anything else (a plain ``Request``) is
    returned to the caller untouched.
    """
    result = self._download_request(request, spider)
    if not isinstance(result, Coroutine):
        return result
    return deferred_from_coro(result)

def start_browser_manager(self) -> Deferred:
    """Run the async start-up hook and expose it as a Deferred (signal-friendly)."""
    startup_coro = self._start_browser_manager()
    return deferred_from_coro(startup_coro)

def stop_browser_manager(self) -> Deferred:
    """Run the async tear-down hook and expose it as a Deferred (signal-friendly)."""
    shutdown_coro = self._stop_browser_manager()
    return deferred_from_coro(shutdown_coro)
23 changes: 12 additions & 11 deletions scrapypuppeteer/browser_managers/browser_downloader_handler.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from collections.abc import Coroutine

from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.reactor import verify_installed_reactor
from scrapy import signals
from twisted.internet.defer import Deferred

from scrapypuppeteer import CloseContextRequest
from scrapypuppeteer.browser_managers import BrowserManager
from scrapypuppeteer.browser_managers.playwright_browser_manager import PlaywrightBrowserManager
from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager
# from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager
from scrapypuppeteer.browser_managers.service_browser_manager import ServiceBrowserManager
from scrapypuppeteer.request import ActionRequest

Expand Down Expand Up @@ -38,18 +36,21 @@ def from_crawler(cls, crawler: Crawler):
match execution_method:
case "puppeteer":
browser_manager = ServiceBrowserManager()
case "pyppeteer":
browser_manager = PyppeteerBrowserManager()
# case "pyppeteer":
# browser_manager = PyppeteerBrowserManager()
case "playwright":
browser_manager = PlaywrightBrowserManager()
case _:
raise ValueError(f"Invalid execution method: {execution_method.upper()}")

return cls(settings, browser_manager, crawler=crawler)
bdh = cls(settings, browser_manager, crawler=crawler)
crawler.signals.connect(bdh.browser_manager.start_browser_manager, signals.engine_started)
crawler.signals.connect(bdh.browser_manager.stop_browser_manager, signals.engine_stopped)
return bdh

def download_request(self, request, spider):
if isinstance(request, (ActionRequest, CloseContextRequest)):
coro_or_request = self.browser_manager.download_request(request, spider)
if isinstance(coro_or_request, Coroutine):
return deferred_from_coro(coro_or_request)
dfd_or_request = self.browser_manager.download_request(request, spider)
if isinstance(dfd_or_request, Deferred):
return dfd_or_request
return super().download_request(request, spider)
40 changes: 25 additions & 15 deletions scrapypuppeteer/browser_managers/playwright_browser_manager.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import asyncio
import base64
import uuid
from typing import Dict, Callable, Coroutine
from typing import Dict, Callable, Awaitable, Union

import syncer
from playwright.async_api import async_playwright, Browser
from scrapy.http import TextResponse

from scrapypuppeteer import PuppeteerResponse, PuppeteerRequest
from scrapypuppeteer.browser_managers import BrowserManager
Expand Down Expand Up @@ -54,10 +55,10 @@ async def close_browser(self):
if self.browser:
await self.browser.close()

def close_contexts(self, request: CloseContextRequest):
async def close_contexts(self, request: CloseContextRequest):
for context_id in request.contexts:
if context_id in self.contexts:
syncer.sync(self.contexts[context_id].close())
await self.contexts[context_id].close()
page_id = self.context_page_map.get(context_id)
self.pages.pop(page_id, None)

Expand All @@ -66,9 +67,9 @@ def close_contexts(self, request: CloseContextRequest):


class PlaywrightBrowserManager(BrowserManager):
def __init__(self, context_manager: ContextManager):
self.context_manager = context_manager
self.action_map: Dict[str, Callable[[ActionRequest], Coroutine[PuppeteerResponse]]] = {
def __init__(self):
self.context_manager: Union[ContextManager, None] = None # Will be initialized later
self.action_map: Dict[str, Callable[[ActionRequest], Awaitable[PuppeteerResponse]]] = {
"goto": self.goto,
"click": self.click,
"compose": self.compose,
Expand All @@ -82,12 +83,14 @@ def __init__(self, context_manager: ContextManager):
"fill_form": self.fill_form,
}

@classmethod
async def async_init(cls):
context_manager = await ContextManager.async_init()
return cls(context_manager)
async def _start_browser_manager(self) -> None:
    """Create the ContextManager (launches the Playwright browser) on engine start."""
    self.context_manager = await ContextManager.async_init()

async def _stop_browser_manager(self) -> None:
    """Close the browser on engine stop; a no-op if start-up never ran."""
    if not self.context_manager:
        return
    await self.context_manager.close_browser()

def download_request(self, request, spider):
def _download_request(self, request, spider):
if isinstance(request, ActionRequest):
endpoint = request.action.endpoint
action_function = self.action_map.get(endpoint)
Expand All @@ -97,11 +100,18 @@ def download_request(self, request, spider):
if isinstance(request, CloseContextRequest):
return self.close_contexts(request)

def close_contexts(self, request: CloseContextRequest):
self.context_manager.close_contexts(request)
async def close_contexts(self, request: CloseContextRequest) -> TextResponse:
    """Close the browser contexts listed in *request* and report success.

    Robustness fix: ``self.context_manager`` is ``None`` until
    ``_start_browser_manager`` runs, so closing before start-up would
    raise AttributeError; with no manager there is nothing to close,
    which still counts as success.
    """
    if self.context_manager is not None:
        await self.context_manager.close_contexts(request)
    return TextResponse(
        request.url,
        encoding="utf-8",
        status=200,
        headers={},
        body=b"Successfully closed context",
    )

def close_used_contexts(self):
self.context_manager.close_browser()
async def close_used_contexts(self):
    """Shut the browser down entirely.

    Robustness fix: guard against ``self.context_manager`` still being
    ``None`` (browser never started), matching the check already done in
    ``_stop_browser_manager``; previously this would raise AttributeError.
    """
    if self.context_manager is not None:
        await self.context_manager.close_browser()

def map_navigation_options(self, navigation_options):
if not navigation_options:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self):
"fill_form": self.fill_form,
}

def download_request(self, request, spider):
def _download_request(self, request, spider):
if isinstance(request, PuppeteerRequest):
endpoint = request.action.endpoint
action_function = self.action_map.get(endpoint)
Expand Down
8 changes: 7 additions & 1 deletion scrapypuppeteer/browser_managers/service_browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,11 @@ class ServiceBrowserManager(BrowserManager):
def __init__(self):
    """Initialize via the BrowserManager base class; no local state is needed."""
    super().__init__()

def download_request(self, request, spider):
def _download_request(self, request, spider):
    """Pass the request through unchanged; the remote puppeteer service executes it."""
    return request

async def _start_browser_manager(self) -> None:
    """No-op start-up: the external service owns the browser lifecycle.

    Consistency fix: the BrowserManager ABC declares this hook
    ``async def``; the sync override only worked because
    ``deferred_from_coro`` passes non-coroutines through. Defining it as a
    coroutine makes ``start_browser_manager`` return a real Deferred, as
    its annotation promises.
    """
    return

async def _stop_browser_manager(self) -> None:
    """No-op tear-down: the external service owns the browser lifecycle.

    Consistency fix: declared ``async def`` to honor the BrowserManager
    ABC's coroutine contract (see ``_start_browser_manager``), so
    ``stop_browser_manager`` wraps a genuine coroutine in its Deferred.
    """
    return

0 comments on commit b113c69

Please sign in to comment.