Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/examples/code/playwright_crawler_with_camoufox.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ async def new_browser(self) -> PlaywrightBrowserController:
raise RuntimeError('Playwright browser plugin is not initialized.')

return PlaywrightBrowserController(
browser=await AsyncNewBrowser(self._playwright, headless=True, **self._browser_options),
browser=await AsyncNewBrowser(self._playwright, headless=True, **self._browser_launch_options),
max_open_pages_per_browser=1, # Increase, if camoufox can handle it in your use case.
header_generator=None, # This turns off the crawlee header_generation. Camoufox has its own.
)
Expand Down
4 changes: 2 additions & 2 deletions src/crawlee/browsers/_base_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,13 @@ def browser_type(self) -> BrowserType:
@abstractmethod
async def new_page(
self,
page_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
proxy_info: ProxyInfo | None = None,
) -> Page:
"""Create a new page with the given context options.

Args:
page_options: Options to configure the new page.
browser_new_context_options: Options to configure the new browser context in which the new page is created.
Comment thread
vdusek marked this conversation as resolved.
Outdated
proxy_info: The proxy configuration to use for the new page.

Returns:
Expand Down
8 changes: 4 additions & 4 deletions src/crawlee/browsers/_base_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ def browser_type(self) -> BrowserType:

@property
@abstractmethod
def browser_options(self) -> Mapping[str, Any]:
"""Return the options for a new browser."""
def browser_launch_options(self) -> Mapping[str, Any]:
Comment thread
vdusek marked this conversation as resolved.
"""Return the options for the `browser.launch` method."""

@property
@abstractmethod
def page_options(self) -> Mapping[str, Any]:
"""Return the options for a new page."""
def browser_new_context_options(self) -> Mapping[str, Any]:
Comment thread
vdusek marked this conversation as resolved.
"""Return the options for the `browser.new_context` method."""

@property
@abstractmethod
Expand Down
24 changes: 13 additions & 11 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,30 +100,30 @@ def with_default_plugin(
cls,
*,
browser_type: BrowserType | None = None,
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

Args:
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
browser_options: Keyword arguments to pass to the browser launch method. These options are provided
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
page_options: Keyword arguments to pass to the page object is set at the playwright context level.
browser_new_context_options: Keyword arguments used to create the Playwright browser context for new pages.
These options are provided directly to Playwright's `browser.new_context` method. For more details,
refer to the Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
Comment thread
vdusek marked this conversation as resolved.
Outdated
headless: Whether to run the browser in headless mode.
kwargs: Additional arguments for default constructor.
"""
plugin_options: dict = defaultdict(dict)
plugin_options['browser_options'] = browser_options or {}
plugin_options['page_options'] = page_options or {}
plugin_options['browser_launch_options'] = browser_launch_options or {}
plugin_options['browser_new_context_options'] = browser_new_context_options or {}

if headless is not None:
plugin_options['browser_options']['headless'] = headless
plugin_options['browser_launch_options']['headless'] = headless

if browser_type:
plugin_options['browser_type'] = browser_type
Expand Down Expand Up @@ -262,13 +262,15 @@ async def _get_new_page(
) -> CrawleePage:
"""Internal method to initialize a new page in a browser using the specified plugin."""
timeout = self._operation_timeout.total_seconds()
browser = self._pick_browser_with_free_capacity(plugin)
browser_controller = self._pick_browser_with_free_capacity(plugin)
Comment thread
vdusek marked this conversation as resolved.

try:
if not browser:
browser = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
if not browser_controller:
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
page = await asyncio.wait_for(
browser.new_page(page_options=plugin.page_options, proxy_info=proxy_info),
browser_controller.new_page(
browser_new_context_options=plugin.browser_new_context_options, proxy_info=proxy_info
),
timeout,
)
except asyncio.TimeoutError as exc:
Expand Down
12 changes: 5 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ def browser_type(self) -> BrowserType:
@override
async def new_page(
self,
page_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
proxy_info: ProxyInfo | None = None,
) -> Page:
if not self._browser_context:
self._browser_context = await self._create_browser_context(page_options, proxy_info)
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')
Expand All @@ -116,13 +116,11 @@ async def new_page(

@override
async def close(self, *, force: bool = False) -> None:
if force:
for page in self._pages:
await page.close()

if self.pages_count > 0:
if self.pages_count > 0 and not force:
raise ValueError('Cannot close the browser while there are open pages.')

if self._browser_context:
await self._browser_context.close()
Comment thread
vdusek marked this conversation as resolved.
await self._browser.close()

def _on_page_close(self, page: Page) -> None:
Expand Down
26 changes: 13 additions & 13 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,26 @@ def __init__(
self,
*,
browser_type: BrowserType = 'chromium',
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
) -> None:
"""A default constructor.

Args:
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
browser_options: Keyword arguments to pass to the browser launch method. These options are provided
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
page_options: Keyword arguments to pass to the page object is set at the playwright context level.
browser_new_context_options: Keyword arguments used to create the Playwright browser context for new pages.
These options are provided directly to Playwright's `browser.new_context` method. For more details,
refer to the Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
Once reached, a new browser instance will be launched to handle the excess.
Comment thread
vdusek marked this conversation as resolved.
"""
self._browser_type = browser_type
self._browser_options = browser_options or {}
self._page_options = page_options or {}
self._browser_launch_options = browser_launch_options or {}
self._browser_new_context_options = browser_new_context_options or {}
self._max_open_pages_per_browser = max_open_pages_per_browser

self._playwright_context_manager = async_playwright()
Expand All @@ -75,13 +75,13 @@ def browser_type(self) -> BrowserType:

@property
@override
def browser_options(self) -> Mapping[str, Any]:
return self._browser_options
def browser_launch_options(self) -> Mapping[str, Any]:
return self._browser_launch_options

@property
@override
def page_options(self) -> Mapping[str, Any]:
return self._page_options
def browser_new_context_options(self) -> Mapping[str, Any]:
return self._browser_new_context_options

@property
@override
Expand Down Expand Up @@ -117,11 +117,11 @@ async def new_browser(self) -> PlaywrightBrowserController:
raise RuntimeError('Playwright browser plugin is not initialized.')

if self._browser_type == 'chromium':
browser = await self._playwright.chromium.launch(**self._browser_options)
browser = await self._playwright.chromium.launch(**self._browser_launch_options)
elif self._browser_type == 'firefox':
browser = await self._playwright.firefox.launch(**self._browser_options)
browser = await self._playwright.firefox.launch(**self._browser_launch_options)
elif self._browser_type == 'webkit':
browser = await self._playwright.webkit.launch(**self._browser_options)
browser = await self._playwright.webkit.launch(**self._browser_launch_options)
else:
raise ValueError(f'Invalid browser type: {self._browser_type}')

Expand Down
25 changes: 14 additions & 11 deletions src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def __init__(
self,
browser_pool: BrowserPool | None = None,
browser_type: BrowserType | None = None,
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
Expand All @@ -82,33 +82,36 @@ def __init__(
browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
This option should not be used if `browser_pool` is provided.
browser_options: Keyword arguments to pass to the browser launch method. These options are provided
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
This option should not be used if `browser_pool` is provided.
page_options: Keyword arguments to pass to the new page method. These options are provided directly to
Playwright's `browser_context.new_page` method. For more details, refer to the Playwright documentation:
https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page.
browser_new_context_options: Keyword arguments used to create the Playwright browser context for new pages.
These options are provided directly to Playwright's `browser.new_context` method. For more details,
refer to the Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
Comment thread
Pijukatel marked this conversation as resolved.
Outdated
This option should not be used if `browser_pool` is provided.
headless: Whether to run the browser in headless mode.
This option should not be used if `browser_pool` is provided.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
"""
if browser_pool:
# Raise an exception if browser_pool is provided together with other browser-related arguments.
if any(param is not None for param in (headless, browser_type, browser_options, page_options)):
if any(
param is not None
for param in (headless, browser_type, browser_launch_options, browser_new_context_options)
):
raise ValueError(
'You cannot provide `headless`, `browser_type`, `browser_options` or `page_options` '
'arguments when `browser_pool` is provided.'
'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
'`browser_new_context_options` arguments when `browser_pool` is provided.'
)

# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
else:
browser_pool = BrowserPool.with_default_plugin(
headless=headless,
browser_type=browser_type,
browser_options=browser_options,
page_options=page_options,
browser_launch_options=browser_launch_options,
browser_new_context_options=browser_new_context_options,
)

self._browser_pool = browser_pool
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/browsers/test_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ async def test_methods_raise_error_when_not_active() -> None:


async def test_with_plugin_contains_page_options(httpbin: URL) -> None:
plugin = PlaywrightBrowserPlugin(page_options={'user_agent': 'My Best User-Agent'})
plugin = PlaywrightBrowserPlugin(browser_new_context_options={'user_agent': 'My Best User-Agent'})
async with BrowserPool(plugins=[plugin]) as browser_pool:
test_page = await browser_pool.new_page()
await test_page.page.goto(str(httpbin / 'user-agent'))
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/browsers/test_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ async def plugin() -> AsyncGenerator[PlaywrightBrowserPlugin, None]:
async def test_initial_state() -> None:
plugin = PlaywrightBrowserPlugin(
browser_type='chromium',
browser_options={'headless': False},
page_options={'viewport': {'width': 1920, 'height': 1080}},
browser_launch_options={'headless': False},
browser_new_context_options={'viewport': {'width': 1920, 'height': 1080}},
max_open_pages_per_browser=10,
)

# Test initial state
assert plugin.browser_type == 'chromium'
assert plugin.browser_options == {'headless': False}
assert plugin.page_options == {'viewport': {'width': 1920, 'height': 1080}}
assert plugin.browser_launch_options == {'headless': False}
assert plugin.browser_new_context_options == {'viewport': {'width': 1920, 'height': 1080}}
assert plugin.max_open_pages_per_browser == 10


Expand Down