How to Screenshot Every Page in Your Sitemap Automatically
Parse sitemap.xml and capture every URL with controlled concurrency and retry — for visual audits, client deliverables, archiving, or pre-migration snapshots.
Visual audits, site archives, client deliverables, pre-migration snapshots — all require screenshots of every page on a site. Doing it manually takes hours. Doing it with Puppeteer means managing a browser pool and handling concurrency yourself.
Here's a script that parses your sitemap.xml and captures every URL with controlled concurrency and automatic retry.
Basic sitemap crawler
import fs from "fs/promises";
import path from "path";
// Pagebolt API key — read from the environment; never hardcode secrets.
const PAGEBOLT_API_KEY = process.env.PAGEBOLT_API_KEY;
// Sitemap to crawl; override with the SITEMAP_URL env var.
const SITEMAP_URL = process.env.SITEMAP_URL || "https://yoursite.com/sitemap.xml";
// Directory where PNGs and the _results.json summary are written.
const OUTPUT_DIR = "screenshots";
const CONCURRENCY = 3; // parallel requests
const DELAY_MS = 500; // between batches
/**
 * Fetch a sitemap (or sitemap index) and return every page URL it lists.
 *
 * If the document is a <sitemapindex> (a sitemap of sitemaps), each <loc>
 * entry is itself a sitemap URL: recurse into all of them in parallel and
 * flatten the results into one list.
 *
 * @param {string} url - Absolute URL of a sitemap.xml or sitemap index.
 * @returns {Promise<string[]>} Every page URL found.
 * @throws {Error} On a non-2xx response — previously an error page was
 *   silently regex-parsed, yielding an empty URL list with no diagnostic.
 */
async function fetchSitemap(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch sitemap ${url}: ${res.status}`);
  }
  const xml = await res.text();
  // Extract all <loc> URLs (works for both <urlset> and <sitemapindex> docs)
  const urls = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)].map((m) => m[1].trim());
  // Handle sitemap index (sitemap of sitemaps): each URL is a nested sitemap
  if (xml.includes("<sitemapindex")) {
    const nested = await Promise.all(urls.map(fetchSitemap));
    return nested.flat();
  }
  return urls;
}
/**
 * Capture a full-page screenshot of `url` via the Pagebolt API, with
 * cookie banners, ads, and trackers blocked for a clean capture.
 *
 * @param {string} url - Page to capture.
 * @returns {Promise<Buffer>} PNG image bytes.
 * @throws {Error} On any non-2xx API response (message includes status + body).
 */
async function screenshot(url) {
  const payload = {
    url,
    fullPage: true,
    blockBanners: true,
    blockAds: true,
    blockTrackers: true,
  };
  const res = await fetch("https://pagebolt.dev/api/v1/screenshot", {
    method: "POST",
    headers: {
      "x-api-key": PAGEBOLT_API_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });
  if (!res.ok) {
    throw new Error(`${res.status}: ${await res.text()}`);
  }
  const bytes = await res.arrayBuffer();
  return Buffer.from(bytes);
}
/**
 * Derive a filesystem-safe PNG filename from a page URL: strip the
 * outermost slashes from the path, flatten remaining "/" into "__",
 * and map the site root to "index.png".
 *
 * @param {string} url - Absolute page URL.
 * @returns {string} Filename such as "blog__post-1.png" or "index.png".
 */
function urlToFilename(url) {
  const { pathname } = new URL(url);
  const trimmed = pathname.replace(/^\/|\/$/g, "");
  const flattened = trimmed.replace(/\//g, "__");
  return `${flattened || "index"}.png`;
}
/**
 * Screenshot a batch of URLs in parallel and persist each PNG to OUTPUT_DIR.
 * Per-URL failures are caught and reported in the result object rather than
 * aborting the rest of the batch.
 *
 * @param {string[]} urls - Page URLs to capture.
 * @returns {Promise<PromiseSettledResult<{url: string, filename: string,
 *   ok: boolean, error?: string}>[]>} One settled result per input URL
 *   (always fulfilled, since errors are caught inside the mapper).
 */
async function processBatch(urls) {
  return Promise.allSettled(
    urls.map(async (url) => {
      const filename = urlToFilename(url);
      const filepath = path.join(OUTPUT_DIR, filename);
      try {
        const image = await screenshot(url);
        await fs.writeFile(filepath, image);
        // Fixed: the success log interpolated a broken `$(unknown)`
        // placeholder instead of the destination filename.
        console.log(`✓ ${url} → ${filename}`);
        return { url, filename, ok: true };
      } catch (err) {
        console.error(`✗ ${url}: ${err.message}`);
        return { url, filename, ok: false, error: err.message };
      }
    })
  );
}
/**
 * Entry point: fetch the sitemap, screenshot every URL in fixed-size
 * batches (CONCURRENCY at a time with a DELAY_MS pause between batches),
 * then write a JSON summary to OUTPUT_DIR/_results.json and print
 * success/failure counts.
 */
async function main() {
// Ensure the output directory exists before any writes.
await fs.mkdir(OUTPUT_DIR, { recursive: true });
console.log(`Fetching sitemap: ${SITEMAP_URL}`);
const urls = await fetchSitemap(SITEMAP_URL);
console.log(`Found ${urls.length} URLs\n`);
const results = [];
// Process in batches to control concurrency
for (let i = 0; i < urls.length; i += CONCURRENCY) {
const batch = urls.slice(i, i + CONCURRENCY);
const batchResults = await processBatch(batch);
// processBatch catches per-URL errors, so these are normally fulfilled
// (`r.value`); `r.reason` is only a fallback for unexpected rejections.
results.push(...batchResults.map((r) => r.value ?? r.reason));
// Pause between batches, but not after the final one.
if (i + CONCURRENCY < urls.length) {
await new Promise((r) => setTimeout(r, DELAY_MS));
}
}
// Write summary
const summary = {
total: results.length,
succeeded: results.filter((r) => r.ok).length,
failed: results.filter((r) => !r.ok).length,
urls: results,
};
await fs.writeFile(
path.join(OUTPUT_DIR, "_results.json"),
JSON.stringify(summary, null, 2)
);
console.log(`\nDone: ${summary.succeeded}/${summary.total} succeeded`);
if (summary.failed > 0) {
console.log("Failed URLs:");
results.filter((r) => !r.ok).forEach((r) => console.log(` ${r.url}: ${r.error}`));
}
}
main().catch(console.error);
PAGEBOLT_API_KEY=your_key SITEMAP_URL=https://yoursite.com/sitemap.xml node crawl.js
With retry on failure
/**
 * screenshot() with automatic retry and linear backoff (2s, 4s, ...).
 *
 * @param {string} url - Page to capture.
 * @param {number} [retries=2] - Extra attempts allowed after the first failure.
 * @returns {Promise<Buffer>} PNG bytes from the first successful attempt.
 * @throws {Error} The final error once every attempt is exhausted.
 */
async function screenshotWithRetry(url, retries = 2) {
  let attempt = 0;
  while (true) {
    try {
      return await screenshot(url);
    } catch (err) {
      // Out of retries: surface the last failure to the caller.
      if (attempt === retries) throw err;
      attempt += 1;
      console.log(` Retry ${attempt}/${retries} for ${url}`);
      // Back off a little longer on each successive retry.
      await new Promise((resolve) => setTimeout(resolve, 2000 * attempt));
    }
  }
}
Mobile + desktop side by side
/**
 * Capture desktop and mobile screenshots of `url` in parallel and write
 * them to OUTPUT_DIR as <name>-desktop.png and <name>-mobile.png.
 *
 * @param {string} url - Page to capture.
 * @throws {Error} If either capture fails. The original mobile path never
 *   checked `res.ok`, so an API error body was written to disk as a
 *   corrupt ".png" — it now fails loudly, matching screenshot().
 */
async function screenshotBoth(url) {
  // Mobile capture with the same error handling as screenshot().
  const captureMobile = async () => {
    const res = await fetch("https://pagebolt.dev/api/v1/screenshot", {
      method: "POST",
      headers: { "x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json" },
      body: JSON.stringify({ url, viewportDevice: "iphone_14_pro", blockBanners: true }),
    });
    if (!res.ok) throw new Error(`${res.status}: ${await res.text()}`);
    return Buffer.from(await res.arrayBuffer());
  };
  const [desktop, mobile] = await Promise.all([screenshot(url), captureMobile()]);
  const base = urlToFilename(url).replace(".png", "");
  await fs.writeFile(path.join(OUTPUT_DIR, `${base}-desktop.png`), desktop);
  await fs.writeFile(path.join(OUTPUT_DIR, `${base}-mobile.png`), mobile);
}
GitHub Actions — scheduled weekly archive
# Scheduled workflow: screenshot every sitemap URL weekly and keep the
# PNGs as a downloadable artifact for 90 days.
name: Weekly site archive
on:
  schedule:
    - cron: "0 3 * * 1" # Every Monday at 3am
  workflow_dispatch: # allow manual runs from the Actions tab
jobs:
  screenshot-sitemap:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Screenshot all pages
        env:
          # API key comes from repository secrets — never commit it.
          PAGEBOLT_API_KEY: ${{ secrets.PAGEBOLT_API_KEY }}
          SITEMAP_URL: https://yoursite.com/sitemap.xml
        run: node scripts/screenshot-sitemap.js
      - name: Upload archive
        uses: actions/upload-artifact@v4
        with:
          name: site-archive-${{ github.run_id }}
          path: screenshots/
          retention-days: 90
Skip already-captured URLs
/**
 * Resumable variant of main(): lists the PNGs already present in
 * OUTPUT_DIR and skips any sitemap URL whose screenshot exists, so an
 * interrupted run can be restarted without redoing finished pages.
 */
async function main() {
  // Missing output dir is fine on a first run — treat it as "nothing done".
  const files = await fs.readdir(OUTPUT_DIR).catch(() => []);
  const existing = new Set(files.filter((name) => name.endsWith(".png")));
  const urls = await fetchSitemap(SITEMAP_URL);
  const pending = urls.filter((url) => !existing.has(urlToFilename(url)));
  console.log(`${urls.length} total, ${pending.length} to capture (${existing.size} already done)`);
  // process only pending...
}
For a 500-page site at 3 concurrent requests: roughly 3–4 minutes, zero browser setup, no memory management.
Get Started Free
100 requests/month, no credit card
Screenshot every page in your sitemap automatically — visual audits, client deliverables, site archives, and pre-migration snapshots with concurrency control.
Get Your Free API Key →