为什么要将 Playwright 与代理服务器配合使用
Playwright 是微软推出的现代浏览器自动化框架,支持 Chromium、Firefox 和 WebKit。与仅发送 HTTP 请求的库不同,Playwright 会渲染完整页面——执行 JavaScript、处理动态内容,并能通过那些会拒绝原始 HTTP 请求的反机器人检查。
与住宅代理配合使用时,Playwright 是从受严密保护的网站采集数据的最有效工具之一。本指南涵盖各个层级的代理配置:浏览器级、上下文级和页面级,并附有可直接复制到项目中的可运行代码。
本指南假设您已拥有 ProxyHat 账户。如果您刚接触代理,请先阅读《什么是代理服务器?》,然后再查看我们的《网页抓取最佳代理》概览。
安装和设置
Node.js(主要)
# Create a default package.json, then install Playwright
npm init -y
npm install playwright
# Download the Chromium browser binaries
npx playwright install chromium
Python
# Install Playwright for Python
pip install playwright
# Download the Chromium browser binaries
python -m playwright install chromium
浏览器级代理配置
最简单的方法是在启动浏览器时设置代理。之后创建的每个上下文和页面都会自动继承该代理。
Node.js —— 浏览器级代理
const { chromium } = require('playwright');
// Launches Chromium with a proxy configured at launch time, opens one page,
// and prints the body text of https://httpbin.org/ip.
(async () => {
  const browser = await chromium.launch({
    proxy: {
      server: 'http://gate.proxyhat.com:8080',
      username: 'USERNAME',
      password: 'PASSWORD',
    },
  });
  try {
    const page = await browser.newPage();
    await page.goto('https://httpbin.org/ip');
    console.log(await page.textContent('body'));
  } finally {
    // Close the browser even when page creation or navigation throws,
    // so a failed run does not leave the browser open.
    await browser.close();
  }
})();
Python —— 浏览器级代理
from playwright.sync_api import sync_playwright
# Launch Chromium with a proxy configured at launch time, open one page, and
# print the body text of https://httpbin.org/ip.
with sync_playwright() as p:
    browser = p.chromium.launch(
        proxy={
            "server": "http://gate.proxyhat.com:8080",
            "username": "USERNAME",
            "password": "PASSWORD",
        }
    )
    try:
        page = browser.new_page()
        page.goto("https://httpbin.org/ip")
        print(page.text_content("body"))
    finally:
        # Close the browser even when page creation or navigation raises.
        browser.close()
按上下文(Per-Context)轮换代理
Playwright 的真正威力在于浏览器上下文(browser context)。每个上下文都是一个隔离的会话——拥有独立的 Cookie、存储和缓存——并且可以使用各自的代理。这是推荐的代理轮换方式,因为它避免了为每个 IP 启动新浏览器的开销。
Node.js —— 按上下文轮换
const { chromium } = require('playwright');
const crypto = require('crypto');
/**
 * Opens a new browser context routed through the proxy gateway under a
 * freshly generated session name.
 *
 * @param {object} browser - Object exposing `newContext(options)`.
 * @returns {Promise<object>} Whatever `browser.newContext` resolves to.
 */
async function createProxiedContext(browser) {
  // 4 random bytes rendered as 8 hex characters.
  const sessionTag = crypto.randomBytes(4).toString('hex');
  const proxy = {
    server: 'http://gate.proxyhat.com:8080',
    username: `USERNAME-session-${sessionTag}`,
    password: 'PASSWORD',
  };
  return browser.newContext({ proxy });
}
// Visits each URL in its own proxied context, one after another, logging the
// size of the fetched HTML or the failure message.
(async () => {
  // Launch browser WITHOUT a proxy — set it per context
  const browser = await chromium.launch();
  const urls = [
    'https://example.com/page/1',
    'https://example.com/page/2',
    'https://example.com/page/3',
  ];
  try {
    for (const url of urls) {
      let context;
      try {
        // Context and page creation sit inside the try so that a failure
        // here is reported per URL instead of aborting the whole run.
        context = await createProxiedContext(browser);
        const page = await context.newPage();
        await page.goto(url, { timeout: 30000 });
        const content = await page.content();
        console.log(`Fetched ${url} — ${content.length} chars`);
      } catch (err) {
        console.error(`Failed ${url}: ${err.message}`);
      } finally {
        // context is undefined when createProxiedContext itself failed.
        if (context) {
          await context.close(); // Releases the session
        }
      }
    }
  } finally {
    // Always shut the browser down, even if a context fails to close.
    await browser.close();
  }
})();
Python —— 按上下文轮换
import uuid
from playwright.sync_api import sync_playwright
def create_proxied_context(browser):
    """Create a browser context that uses the proxy under a fresh session name.

    Args:
        browser: Object exposing ``new_context(proxy=...)``.

    Returns:
        Whatever ``browser.new_context`` returns.
    """
    # First 8 hex characters of a random UUID serve as the session suffix.
    session_id = uuid.uuid4().hex[:8]
    context = browser.new_context(
        proxy={
            "server": "http://gate.proxyhat.com:8080",
            "username": f"USERNAME-session-{session_id}",
            "password": "PASSWORD",
        }
    )
    return context
# Visit each URL in its own proxied context, one after another, printing the
# size of the fetched HTML or the failure message.
with sync_playwright() as p:
    browser = p.chromium.launch()
    urls = [
        "https://example.com/page/1",
        "https://example.com/page/2",
        "https://example.com/page/3",
    ]
    try:
        for url in urls:
            context = create_proxied_context(browser)
            try:
                # Page creation is inside the try so the context is closed
                # even when new_page() raises.
                page = context.new_page()
                page.goto(url, timeout=30000)
                print(f"Fetched {url} — {len(page.content())} chars")
            except Exception as e:
                print(f"Failed {url}: {e}")
            finally:
                context.close()
    finally:
        browser.close()
地理定位上下文
抓取本地化内容时,可以将 ProxyHat 的地理定位与 Playwright 的区域设置(locale)和时区设置结合起来,以尽可能贴近真实用户。所有可用地点请参见我们的地点页面。
const { chromium } = require('playwright');
// Per-region settings: the browser locale, the IANA timezone id, and the
// country code that is embedded in the proxy username.
const GEO_PROFILES = {
  us: { locale: 'en-US', timezone: 'America/New_York', country: 'us' },
  de: { locale: 'de-DE', timezone: 'Europe/Berlin', country: 'de' },
  jp: { locale: 'ja-JP', timezone: 'Asia/Tokyo', country: 'jp' },
};

/**
 * Opens a browser context whose proxy username, locale and timezone all come
 * from the GEO_PROFILES entry for `region`.
 *
 * @param {object} browser - Object exposing `newContext(options)`.
 * @param {string} region - A key of GEO_PROFILES ('us', 'de' or 'jp').
 * @returns {Promise<object>} Whatever `browser.newContext` resolves to.
 * @throws {Error} (as a rejection) when `region` is not a GEO_PROFILES key.
 */
async function createGeoContext(browser, region) {
  // Own-property check so inherited names such as 'constructor' are rejected
  // with a readable message rather than failing on an undefined profile.
  if (!Object.prototype.hasOwnProperty.call(GEO_PROFILES, region)) {
    const known = Object.keys(GEO_PROFILES).join(', ');
    throw new Error(`Unknown region "${region}". Expected one of: ${known}`);
  }
  const profile = GEO_PROFILES[region];
  // No `geolocation` key is passed: an explicit null is not a documented
  // value for that option, and omitting it requests no override.
  return browser.newContext({
    proxy: {
      server: 'http://gate.proxyhat.com:8080',
      username: `USERNAME-country-${profile.country}`,
      password: 'PASSWORD',
    },
    locale: profile.locale,
    timezoneId: profile.timezone,
  });
}
// Opens https://example.com/pricing once per region and logs the page title
// prefixed with the upper-cased region code.
(async () => {
  const browser = await chromium.launch();
  try {
    for (const region of ['us', 'de', 'jp']) {
      const context = await createGeoContext(browser, region);
      try {
        const page = await context.newPage();
        await page.goto('https://example.com/pricing');
        console.log(`${region.toUpperCase()}: ${await page.title()}`);
      } finally {
        // Close the context even when navigation throws.
        await context.close();
      }
    }
  } finally {
    // Close the browser even when one of the regions fails.
    await browser.close();
  }
})();
使用工作池并发抓取
Playwright 的上下文非常轻量。您可以并行运行多个上下文,每个上下文使用不同的代理会话,从而大幅提高吞吐量。
const { chromium } = require('playwright');
const crypto = require('crypto');
// Number of URLs processed in parallel per batch.
const MAX_CONCURRENCY = 5;

/**
 * Loads `url` in a dedicated proxied context and reports the page title.
 *
 * Never rejects for context-creation, page-creation or navigation failures:
 * those are returned as a `{ success: false }` record so that one bad URL
 * cannot reject a surrounding `Promise.all`.
 *
 * @param {object} browser - Object exposing `newContext(options)`.
 * @param {string} url - Address to load.
 * @returns {Promise<{url: string, title?: string, error?: string, success: boolean}>}
 */
async function scrapeUrl(browser, url) {
  const sessionId = crypto.randomBytes(4).toString('hex');
  let context;
  try {
    // Context and page creation are inside the try so their failures are
    // reported like navigation failures and the context is still closed.
    context = await browser.newContext({
      proxy: {
        server: 'http://gate.proxyhat.com:8080',
        username: `USERNAME-session-${sessionId}`,
        password: 'PASSWORD',
      },
    });
    const page = await context.newPage();
    await page.goto(url, { timeout: 30000, waitUntil: 'domcontentloaded' });
    const title = await page.title();
    return { url, title, success: true };
  } catch (err) {
    return { url, error: err.message, success: false };
  } finally {
    // context is undefined when newContext itself failed.
    if (context) {
      await context.close();
    }
  }
}
/**
 * Scrapes every URL using one shared browser, MAX_CONCURRENCY URLs at a time.
 * Each batch is awaited in full before the next one starts.
 *
 * @param {string[]} urls - Addresses to scrape.
 * @returns {Promise<object[]>} One scrapeUrl result per URL, in input order.
 */
async function scrapeAll(urls) {
  const browser = await chromium.launch();
  const results = [];
  try {
    // Process in batches of MAX_CONCURRENCY
    for (let i = 0; i < urls.length; i += MAX_CONCURRENCY) {
      const batch = urls.slice(i, i + MAX_CONCURRENCY);
      const batchResults = await Promise.all(
        batch.map((url) => scrapeUrl(browser, url))
      );
      results.push(...batchResults);
      console.log(`Completed batch ${Math.floor(i / MAX_CONCURRENCY) + 1}`);
    }
  } finally {
    // Close the browser even if a batch rejects.
    await browser.close();
  }
  return results;
}
// Usage: scrape 20 generated product URLs and print how many succeeded.
const urls = Array.from({ length: 20 }, (_, i) =>
  `https://example.com/product/${i + 1}`
);
scrapeAll(urls)
  .then((results) => {
    const success = results.filter((r) => r.success).length;
    console.log(`Success: ${success}/${results.length}`);
  })
  .catch((err) => {
    // Without a handler a launch/close failure would surface only as an
    // unhandled promise rejection.
    console.error(`Scrape run failed: ${err.message}`);
    process.exitCode = 1;
  });
关于更高级的并发模式,请参见我们的指南《通过并发控制扩展代理请求》。
隐形配置
默认的 Playwright 浏览器带有可被检测到的自动化标记。以下设置可以减少您的指纹特征,帮助绕过反机器人系统。
基本隐形设置
const { chromium } = require('playwright');
// Launches Chromium with two feature flags, opens a proxied context with a
// fixed desktop profile, installs an init script that overrides several
// navigator/window properties, then saves a screenshot of
// https://bot.sannysoft.com/ to stealth-test.png.
(async () => {
  const browser = await chromium.launch({
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-features=IsolateOrigins,site-per-process',
    ],
  });
  try {
    const context = await browser.newContext({
      proxy: {
        server: 'http://gate.proxyhat.com:8080',
        username: 'USERNAME',
        password: 'PASSWORD',
      },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/New_York',
      deviceScaleFactor: 1,
      hasTouch: false,
      isMobile: false,
      javaScriptEnabled: true,
    });
    // Remove automation markers
    await context.addInitScript(() => {
      // Override navigator.webdriver
      Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined,
      });
      // Override navigator.plugins to look real
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
      });
      // Override navigator.languages
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en'],
      });
      // Override chrome.runtime to avoid detection
      window.chrome = { runtime: {} };
    });
    const page = await context.newPage();
    await page.goto('https://bot.sannysoft.com/');
    await page.screenshot({ path: 'stealth-test.png' });
  } finally {
    // Close the browser even when context setup, navigation or the
    // screenshot throws.
    await browser.close();
  }
})();
Python 隐形配置
from playwright.sync_api import sync_playwright
# Launch Chromium with the AutomationControlled blink feature disabled, open a
# proxied context with a fixed desktop profile, install an init script, and
# print the body text of https://httpbin.org/headers.
with sync_playwright() as p:
    browser = p.chromium.launch(
        args=[
            "--disable-blink-features=AutomationControlled",
        ]
    )
    try:
        context = browser.new_context(
            proxy={
                "server": "http://gate.proxyhat.com:8080",
                "username": "USERNAME",
                "password": "PASSWORD",
            },
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/131.0.0.0 Safari/537.36",
            viewport={"width": 1920, "height": 1080},
            locale="en-US",
            timezone_id="America/New_York",
        )
        # The script text is kept flush-left so the string handed to
        # add_init_script stays exactly as it appears in the original.
        context.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined,
});
window.chrome = { runtime: {} };
""")
        page = context.new_page()
        page.goto("https://httpbin.org/headers")
        print(page.text_content("body"))
    finally:
        # Close the browser even when setup or navigation raises.
        browser.close()
带代理轮换的重试逻辑
将重试逻辑与自动代理轮换相结合,可确保失败的请求使用全新的 IP 和上下文进行重试。
const { chromium } = require('playwright');
const crypto = require('crypto');
/**
 * Fetches the HTML of `url`, using a new proxied context (new session name)
 * for every attempt.
 *
 * An attempt fails when context/page creation or navigation throws, or when
 * the response status is 400 or higher. Between failed attempts the function
 * waits 1s, 2s, 4s, ... (doubling each time); there is no wait after the
 * final attempt.
 *
 * @param {object} browser - Object exposing `newContext(options)`.
 * @param {string} url - Address to load.
 * @param {number} [maxRetries=3] - Total number of attempts.
 * @returns {Promise<string>} The page HTML from the first successful attempt.
 * @throws {Error} When every attempt failed; `cause` holds the last failure.
 */
async function fetchWithRetry(browser, url, maxRetries = 3) {
  let lastError;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    const sessionId = crypto.randomBytes(4).toString('hex');
    let context;
    try {
      context = await browser.newContext({
        proxy: {
          server: 'http://gate.proxyhat.com:8080',
          username: `USERNAME-session-${sessionId}`,
          password: 'PASSWORD',
        },
      });
      const page = await context.newPage();
      const response = await page.goto(url, {
        timeout: 30000,
        waitUntil: 'domcontentloaded',
      });
      if (response && response.status() >= 400) {
        // Route HTTP errors through the same failure path as exceptions so
        // they count towards maxRetries instead of silently falling through.
        throw new Error(`HTTP ${response.status()}`);
      }
      return await page.content();
    } catch (err) {
      lastError = err;
      console.log(`Attempt ${attempt} failed: ${err.message}`);
    } finally {
      // Single cleanup point; context is undefined when newContext failed.
      if (context) {
        await context.close();
      }
    }
    if (attempt < maxRetries) {
      // Exponential backoff
      await new Promise((r) => setTimeout(r, 1000 * Math.pow(2, attempt - 1)));
    }
  }
  throw new Error(`All ${maxRetries} attempts failed for ${url}`, {
    cause: lastError,
  });
}
// Entry point: fetch one URL through fetchWithRetry and log either the size
// of the HTML or the failure message; the browser is closed afterwards.
(async () => {
  const browser = await chromium.launch();
  try {
    const markup = await fetchWithRetry(browser, 'https://example.com/data');
    console.log(`Fetched ${markup.length} chars`);
  } catch (failure) {
    console.error(failure.message);
  } finally {
    await browser.close();
  }
})();
在 Playwright 中使用 SOCKS5 代理
ProxyHat 还在 1080 端口上支持 SOCKS5。当您需要与协议无关的代理,或希望避免 HTTP CONNECT 处理时,这会很有用。
const { chromium } = require('playwright');
// Launches Chromium through a SOCKS5 proxy and prints the body text of
// https://httpbin.org/ip.
// NOTE(review): Playwright may reject SOCKS5 proxies that carry a
// username/password (proxy authentication is not supported for SOCKS5 in
// the browser) — confirm against the Playwright proxy documentation for the
// installed version before relying on this snippet.
(async () => {
  const browser = await chromium.launch({
    proxy: {
      server: 'socks5://gate.proxyhat.com:1080',
      username: 'USERNAME',
      password: 'PASSWORD',
    },
  });
  try {
    const page = await browser.newPage();
    await page.goto('https://httpbin.org/ip');
    console.log(await page.textContent('body'));
  } finally {
    // Close the browser even when page creation or navigation throws.
    await browser.close();
  }
})();
生产级抓取器模式
下面是一个完整的、可用于生产环境的抓取器,它结合了上述所有模式——按上下文轮换代理、隐形设置、重试逻辑、并发控制以及结构化数据提取。
const { chromium } = require('playwright');
const crypto = require('crypto');
const fs = require('fs');
/**
 * Batch scraper that loads each URL in its own proxied browser context,
 * retries failed loads, and collects one result record per URL.
 */
class PlaywrightScraper {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=3] - URLs processed in parallel per batch.
   * @param {number} [options.maxRetries=3] - Attempts per URL.
   */
  constructor({ concurrency = 3, maxRetries = 3 } = {}) {
    this.concurrency = concurrency;
    this.maxRetries = maxRetries;
    this.browser = null;
    // Accumulates across scrapeAll() calls on the same instance.
    this.results = [];
    this.stats = { success: 0, failed: 0 };
  }

  /** Launches the shared Chromium instance used by every context. */
  async init() {
    this.browser = await chromium.launch({
      args: ['--disable-blink-features=AutomationControlled'],
    });
  }

  /**
   * Opens a context with a fresh proxy session name and a fixed desktop
   * user agent, viewport and locale.
   *
   * @returns {Promise<object>} Whatever `this.browser.newContext` resolves to.
   */
  _createContext() {
    const sessionId = crypto.randomBytes(4).toString('hex');
    return this.browser.newContext({
      proxy: {
        server: 'http://gate.proxyhat.com:8080',
        username: `USERNAME-session-${sessionId}`,
        password: 'PASSWORD',
      },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
    });
  }

  /**
   * Scrapes one URL, trying up to `maxRetries` times with a new context per
   * attempt. A missing response or a status of 400+ counts as a failed
   * attempt. Always resolves to a result record and updates `stats` exactly
   * once per call.
   *
   * @param {string} url - Address to load.
   * @returns {Promise<object>} `{ url, title, text, success: true }` or
   *   `{ url, error, success: false }`.
   */
  async scrapePage(url) {
    let lastError = 'No attempts were made';
    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      let context;
      let result;
      try {
        context = await this._createContext();
        const page = await context.newPage();
        const response = await page.goto(url, {
          timeout: 30000,
          waitUntil: 'networkidle',
        });
        if (!response || response.status() >= 400) {
          // Treat HTTP-level failures like thrown errors so they are
          // recorded instead of falling out of the loop with no result.
          throw new Error(
            response ? `HTTP ${response.status()}` : 'No response received'
          );
        }
        // Extract data — customize this for your target
        const data = await page.evaluate(() => ({
          title: document.title,
          text: document.body.innerText.substring(0, 500),
        }));
        result = { url, ...data, success: true };
      } catch (err) {
        lastError = err.message;
      } finally {
        // Single cleanup point; context is undefined if creation failed.
        if (context) {
          await context.close();
        }
      }
      if (result) {
        this.stats.success++;
        return result;
      }
      if (attempt < this.maxRetries) {
        // Linear backoff: 1s, 2s, ... before the next attempt.
        await new Promise((r) => setTimeout(r, 1000 * attempt));
      }
    }
    this.stats.failed++;
    return { url, error: lastError, success: false };
  }

  /**
   * Launches the browser, scrapes all URLs in batches of `concurrency`, and
   * closes the browser again.
   *
   * @param {string[]} urls - Addresses to scrape.
   * @returns {Promise<object[]>} `this.results`, including records from any
   *   earlier call on this instance.
   */
  async scrapeAll(urls) {
    await this.init();
    try {
      for (let i = 0; i < urls.length; i += this.concurrency) {
        const batch = urls.slice(i, i + this.concurrency);
        const batchResults = await Promise.all(
          batch.map((url) => this.scrapePage(url))
        );
        this.results.push(...batchResults);
      }
    } finally {
      // Close the browser even if a batch rejects.
      await this.browser.close();
    }
    console.log(`Done: ${this.stats.success} OK, ${this.stats.failed} failed`);
    return this.results;
  }
}
// Usage: scrape 50 generated item URLs and write the records to results.json.
const scraper = new PlaywrightScraper({ concurrency: 5, maxRetries: 3 });
const urls = Array.from({ length: 50 }, (_, i) =>
  `https://example.com/item/${i + 1}`
);
scraper
  .scrapeAll(urls)
  .then((results) => {
    fs.writeFileSync('results.json', JSON.stringify(results, null, 2));
  })
  .catch((err) => {
    // Without a handler a launch or file-write failure would surface only
    // as an unhandled promise rejection.
    console.error(`Scrape run failed: ${err.message}`);
    process.exitCode = 1;
  });
关于如何构建可复用的代理抽象层,请参见《构建代理中间件层》。还可以了解用于简化代理管理的 Node SDK 和 Python SDK,并查看 ProxyHat 定价,立即开始使用。






