How to Give a CrewAI Agent Browser Tools
Add real browser capabilities to a CrewAI agent — take screenshots, generate PDFs, inspect pages, and run multi-step sequences — using PageBolt as a CrewAI tool.
CrewAI agents collaborate in crews — researcher, writer, QA, analyst. What none of them can do by default is actually look at a web page, take a screenshot, or verify that a UI element exists.
Here's how to add browser tools to any CrewAI agent using the PageBolt API.
Install
pip install crewai crewai-tools requests
Define the tools
CrewAI tools extend BaseTool and implement a _run method. One class per capability:
import os
import base64
import requests
from crewai.tools import BaseTool
from pydantic import Field
from typing import Optional
# PageBolt API key, read from the environment. A missing variable raises
# KeyError as soon as this module is loaded.
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]
# Base URL that the per-endpoint paths (e.g. /screenshot) are appended to.
BASE_URL = "https://pagebolt.dev/api/v1"
# Headers sent with every request: the API key plus a JSON content type.
HEADERS = {"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"}
class ScreenshotTool(BaseTool):
    """CrewAI tool that captures a full-page screenshot through the PageBolt API."""

    name: str = "take_screenshot"
    description: str = (
        "Take a screenshot of any web page. "
        "Input: a full URL (e.g. https://example.com). "
        "Returns a confirmation with the image size. "
        "Use this to visually inspect a page, check a layout, "
        "or verify rendered content."
    )

    def _run(self, url: str) -> str:
        """Capture ``url`` and return a short text summary of the image.

        Args:
            url: Full URL of the page to capture.

        Returns:
            A confirmation containing the response size in bytes and the
            beginning of the image as a base64 data URI (at most 80
            base64 characters).

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        res = requests.post(
            f"{BASE_URL}/screenshot",
            headers=HEADERS,
            json={"url": url, "blockBanners": True, "fullPage": True},
            timeout=30,
        )
        res.raise_for_status()
        image = res.content
        # Only the first 80 base64 characters are reported. 60 input bytes
        # encode to exactly 80 characters (20 complete 3-byte groups), so
        # encoding just that prefix yields the same preview without
        # encoding the whole image.
        preview = base64.b64encode(image[:60]).decode()
        return (
            f"Screenshot captured from {url} ({len(image):,} bytes). "
            f"Base64 preview: data:image/png;base64,{preview}..."
        )
class InspectPageTool(BaseTool):
    """CrewAI tool that lists a page's elements via the PageBolt inspect endpoint."""

    name: str = "inspect_page"
    description: str = (
        "Get all interactive elements on a page with their CSS selectors. "
        "Input: a full URL. "
        "Returns buttons, inputs, links, and forms with unique selectors. "
        "Always use this before attempting to automate a page."
    )

    def _run(self, url: str) -> str:
        """Inspect ``url`` and return a text listing of its elements.

        Args:
            url: Full URL of the page to inspect.

        Returns:
            A header line with the total element count, followed by one
            line per element for at most the first 30 elements. Each line
            shows the tag, role, text (cut to 70 characters) and selector.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.post(
            f"{BASE_URL}/inspect",
            headers=HEADERS,
            json={"url": url},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
        found = payload.get("elements", [])

        # Cap the listing at 30 entries; the header still reports the full count.
        rows = []
        for el in found[:30]:
            tag = el.get("tag")
            role = el.get("role", "")
            text = (el.get("text") or "")[:70]
            selector = el.get("selector", "")
            rows.append(f"{tag}[{role}] '{text}' -> {selector}")

        header = f"Found {len(found)} elements on {url}:\n"
        return header + "\n".join(rows)
class GeneratePDFTool(BaseTool):
    """CrewAI tool that renders a page to PDF via PageBolt and saves it locally."""

    name: str = "generate_pdf"
    description: str = (
        "Generate a PDF of any web page or HTML content. "
        "Input: a full URL. "
        "Use for invoices, reports, documentation, or any printable page."
    )

    def _run(self, url: str) -> str:
        """Render ``url`` to PDF and write it to the current directory.

        Args:
            url: Full URL of the page to render.

        Returns:
            A confirmation with the output file path and the PDF size in bytes.

        Raises:
            requests.HTTPError: If the API responds with an error status.
            OSError: If the output file cannot be written.
        """
        # Imported locally so this block needs no new file-level import.
        import re

        res = requests.post(
            f"{BASE_URL}/pdf",
            headers=HEADERS,
            json={"url": url},
            timeout=30,
        )
        res.raise_for_status()

        # The file name comes from the last URL segment, which can carry a
        # query string or characters that are invalid in file names on some
        # platforms (?, :, *, \). Collapse anything that is not a word
        # character, dot or hyphen into an underscore.
        last_segment = url.split("/")[-1] or "page"
        safe_name = re.sub(r"[^\w.-]+", "_", last_segment)
        output_path = f"output_{safe_name}.pdf"

        with open(output_path, "wb") as f:
            f.write(res.content)
        return f"PDF saved to {output_path} ({len(res.content):,} bytes) from {url}"
class RunSequenceTool(BaseTool):
    """CrewAI tool that runs a multi-step browser sequence through PageBolt."""

    name: str = "run_browser_sequence"
    description: str = (
        "Run a multi-step browser automation and screenshot the final state. "
        "Input: a JSON string with keys: url (starting page) and steps (list of actions). "
        "Each step has: action (navigate/click/fill/wait/screenshot), "
        "selector (for click/fill), value (for fill), url (for navigate), ms (for wait). "
        "Example: {\"url\": \"https://example.com\", \"steps\": [{\"action\": \"click\", \"selector\": \"#btn\"}, {\"action\": \"screenshot\"}]}"
    )

    def _run(self, input_json: str) -> str:
        """Validate the JSON input, run the sequence and summarize the result.

        A ``navigate`` step to ``url`` is prepended, and a ``screenshot``
        step is appended unless the last step already is one.

        Args:
            input_json: JSON object with a ``url`` string and an optional
                ``steps`` list of step objects.

        Returns:
            A summary of the completed sequence, or a message starting
            with ``Error:`` / ``Sequence failed:`` describing the problem.
            Invalid input and HTTP error statuses are reported as text so
            the agent can correct its call.

        Raises:
            requests.RequestException: On connection failures or timeouts.
        """
        import json

        invalid_input = "Error: input must be valid JSON with 'url' and 'steps' keys"
        try:
            data = json.loads(input_json)
        except (json.JSONDecodeError, TypeError):
            return invalid_input
        # Valid JSON that is not an object (a list, string, number) has no keys.
        if not isinstance(data, dict):
            return invalid_input

        url = data.get("url")
        if not isinstance(url, str) or not url.strip():
            return "Error: 'url' must be a non-empty string"

        raw_steps = data.get("steps")
        if raw_steps is None:
            raw_steps = []
        if not isinstance(raw_steps, list) or not all(
            isinstance(step, dict) for step in raw_steps
        ):
            return "Error: 'steps' must be a list of JSON objects"

        # Work on a copy so the parsed input is not modified.
        steps = list(raw_steps)
        # Ensure there's a screenshot step at the end.
        if not steps or steps[-1].get("action") != "screenshot":
            steps.append({"action": "screenshot"})
        all_steps = [{"action": "navigate", "url": url}] + steps

        res = requests.post(
            f"{BASE_URL}/sequence",
            headers=HEADERS,
            json={"steps": all_steps},
            timeout=60,
        )
        if not res.ok:
            return f"Sequence failed: {res.status_code} {res.text[:200]}"

        try:
            result = res.json()
        except ValueError:
            return "Sequence failed: response was not valid JSON"
        outputs = (result.get("outputs") or []) if isinstance(result, dict) else []
        return (
            f"Sequence completed: {len(outputs)} output(s). "
            f"Steps executed: {len(all_steps)}. "
            f"Final state captured."
        )
Build a crew
A research crew that can visually inspect websites before writing reports:
from crewai import Agent, Task, Crew, Process
# One shared instance of each PageBolt tool.
screenshot_tool = ScreenshotTool()
inspect_tool = InspectPageTool()
pdf_tool = GeneratePDFTool()
sequence_tool = RunSequenceTool()

# Agents: each role gets only the tools it needs.
web_researcher = Agent(
    role="Web Research Specialist",
    goal="Gather accurate information from web pages using visual inspection",
    backstory=(
        "You are a meticulous researcher who takes screenshots to verify claims, "
        "inspects page structure before drawing conclusions, and documents findings "
        "with visual evidence."
    ),
    verbose=True,
    tools=[screenshot_tool, inspect_tool],
)

qa_engineer = Agent(
    role="QA Engineer",
    goal="Verify that web pages render correctly and interactive elements work as expected",
    backstory=(
        "You are a QA engineer who checks pages visually and structurally. "
        "You inspect pages to find their elements, then verify they look and function correctly."
    ),
    verbose=True,
    tools=[screenshot_tool, inspect_tool, sequence_tool],
)

# Defined for reuse; not part of the crew assembled at the end of this snippet.
document_specialist = Agent(
    role="Documentation Specialist",
    goal="Generate PDF documentation and reports from web pages",
    backstory=(
        "You capture web content as PDFs for compliance, archiving, and distribution."
    ),
    verbose=True,
    tools=[screenshot_tool, pdf_tool],
)

# Tasks: one per agent, each with an explicit expected output.
research_task = Task(
    agent=web_researcher,
    description=(
        "Research the homepage of https://example.com. "
        "1. Take a screenshot and describe the visual layout. "
        "2. Inspect the page and list all navigation links and CTA buttons with their selectors. "
        "3. Summarize what the page is about based on visual and structural inspection."
    ),
    expected_output=(
        "A structured report with: visual description, list of navigation elements "
        "with selectors, and a 2-3 sentence summary of page purpose."
    ),
)

qa_task = Task(
    agent=qa_engineer,
    description=(
        "QA check https://example.com. "
        "1. Inspect the page to find all form inputs and buttons. "
        "2. Take a full-page screenshot. "
        "3. Report any elements that look broken, missing, or unexpected."
    ),
    expected_output=(
        "QA report with: list of interactive elements found, screenshot confirmation, "
        "and pass/fail assessment of page integrity."
    ),
)

# Assemble the crew and run the tasks in the order listed.
crew = Crew(
    agents=[web_researcher, qa_engineer],
    tasks=[research_task, qa_task],
    process=Process.sequential,
    verbose=True,
)

result = crew.kickoff()
print(result)
Visual QA crew for deploy checks
from crewai import Agent, Task, Crew, Process
# A single agent that checks every page in the list.
visual_checker = Agent(
    role="Visual QA Specialist",
    goal="Check that key pages render correctly after a deployment",
    backstory="You catch visual regressions and layout issues before users do.",
    verbose=True,
    tools=[screenshot_tool, inspect_tool],
)

# Pages to verify after a deploy.
pages_to_check = [
    "https://staging.yourapp.com",
    "https://staging.yourapp.com/pricing",
    "https://staging.yourapp.com/login",
]

# One task per page, all assigned to the same agent.
tasks = [
    Task(
        agent=visual_checker,
        description=(
            f"Check {page}: take a screenshot and inspect for missing elements. "
            "Report any visual issues, broken layouts, or missing navigation."
        ),
        expected_output=f"Pass/fail report for {page} with details of any issues found.",
    )
    for page in pages_to_check
]

crew = Crew(
    agents=[visual_checker],
    tasks=tasks,
    process=Process.sequential,
    verbose=True,
)

result = crew.kickoff()
print(result)
Parallel crew with roles
from crewai import Agent, Task, Crew, Process
# Three specialists, each handling a different aspect of the same page.
# Agent requires a backstory in addition to role and goal.
screenshotter = Agent(
    role="Visual Capture Specialist",
    goal="Take accurate screenshots for documentation and verification",
    backstory="You capture pages exactly as a visitor would see them.",
    tools=[screenshot_tool],
    verbose=False,
)
inspector = Agent(
    role="Page Structure Analyst",
    goal="Map all interactive elements on a page for automation and testing",
    backstory="You map page structure so that automation and tests can rely on it.",
    tools=[inspect_tool],
    verbose=False,
)
archivist = Agent(
    role="PDF Archivist",
    goal="Generate PDF archives of important pages",
    backstory="You keep reliable PDF archives of pages that matter.",
    tools=[pdf_tool],
    verbose=False,
)

url = "https://example.com"

# CrewAI's Process enum has no `parallel` member (only sequential and
# hierarchical). Concurrency is requested per task with async_execution=True.
# A crew may end with at most one asynchronous task, so the first two tasks
# run concurrently and the final task stays synchronous; it starts once the
# pending asynchronous tasks have finished.
crew = Crew(
    agents=[screenshotter, inspector, archivist],
    tasks=[
        Task(
            description=f"Take a full-page screenshot of {url} and describe what you see.",
            agent=screenshotter,
            expected_output="Visual description of the page.",
            async_execution=True,
        ),
        Task(
            description=f"Inspect {url} and list all forms and buttons.",
            agent=inspector,
            expected_output="Structured list of interactive elements with CSS selectors.",
            async_execution=True,
        ),
        Task(
            description=f"Generate a PDF of {url} and confirm it was saved.",
            agent=archivist,
            expected_output="Confirmation with file path and size.",
        ),
    ],
    process=Process.sequential,
    verbose=True,
)

result = crew.kickoff()
print(result)
The agent framework picture
All three major Python agent frameworks now have working PageBolt integrations:
| Framework | Integration | Best for |
|---|---|---|
| LangChain | langchain-mcp-adapters or manual @tool | Flexible agent chains, custom logic |
| LlamaIndex | BasicMCPClient + McpToolSpec | RAG + web capture pipelines |
| CrewAI | BaseTool subclasses | Multi-agent crews with defined roles |
All three can also use the MCP approach (see How to use PageBolt MCP tools in a LangChain or LlamaIndex agent) if you prefer automatic tool discovery over manual wrappers.
Try it free
100 requests/month, no credit card required. OG images, screenshots, PDFs, and video — one API.
Get API Key — Free