Proxy servers are an essential part of the infrastructure for web scraping, price monitoring, and data collection. Ruby offers several ways to work with proxies, from the built-in Net::HTTP to specialized libraries. Let's walk through each option with code examples.
Net::HTTP: basic proxy support
Net::HTTP is Ruby's standard library for HTTP requests. Proxy support is provided by Net::HTTP::Proxy, which returns a proxied HTTP client class.
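Before the full client below, here is a minimal sketch of the mechanism itself (the gateway host and credentials are placeholders):
require 'net/http'
require 'uri'
# Net::HTTP::Proxy returns an anonymous subclass of Net::HTTP whose
# connections are all routed through the given proxy
proxy = Net::HTTP::Proxy('gate.proxyhat.com', 8080, 'user', 'pass')
puts proxy.get_response(URI('https://httpbin.org/ip')).body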
Basic example with authentication
require 'net/http'
require 'uri'
class ProxyClient
PROXY_HOST = 'gate.proxyhat.com'
PROXY_PORT = 8080
PROXY_USER = 'your_username'
PROXY_PASS = 'your_password'
def initialize(proxy_host: PROXY_HOST, proxy_port: PROXY_PORT,
proxy_user: PROXY_USER, proxy_pass: PROXY_PASS)
@proxy_class = Net::HTTP::Proxy(proxy_host, proxy_port, proxy_user, proxy_pass)
end
def get(url, timeout: 30)
uri = URI.parse(url)
response = @proxy_class.start(uri.host, uri.port,
use_ssl: uri.scheme == 'https',
open_timeout: timeout,
read_timeout: timeout) do |http|
request = Net::HTTP::Get.new(uri.request_uri)
request['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
request['Accept'] = 'text/html,application/xhtml+xml'
http.request(request)
end
handle_response(response)
rescue Net::OpenTimeout => e
{ error: 'connection_timeout', message: e.message }
rescue Net::ReadTimeout => e
{ error: 'read_timeout', message: e.message }
rescue SocketError => e
{ error: 'dns_error', message: e.message }
rescue StandardError => e
{ error: 'unknown_error', message: e.message }
end
private
def handle_response(response)
case response
when Net::HTTPSuccess
{
status: response.code.to_i,
headers: response.each_header.to_h,
body: response.body
}
when Net::HTTPRedirection
{ status: response.code.to_i, redirect_to: response['Location'] }
when Net::HTTPClientError
{ status: response.code.to_i, error: 'client_error', body: response.body }
when Net::HTTPServerError
{ status: response.code.to_i, error: 'server_error', body: response.body }
end
end
end
# Usage
client = ProxyClient.new(
proxy_user: 'user-country-US',
proxy_pass: 'your_password'
)
result = client.get('https://httpbin.org/ip')
puts result.inspect
Retries with exponential backoff
require 'net/http'
class ResilientProxyClient < ProxyClient
MAX_RETRIES = 3
BASE_DELAY = 1.0
def get_with_retry(url, max_retries: MAX_RETRIES)
retries = 0
loop do
result = get(url)
return result unless result[:error]
if retryable_error?(result[:error])
retries += 1
return result if retries > max_retries
delay = BASE_DELAY * (2 ** (retries - 1)) + rand(0.0..0.5)
puts "Retry #{retries}/#{max_retries} after #{delay.round(2)}s"
sleep(delay)
else
return result
end
end
end
private
def retryable_error?(error)
%w[connection_timeout read_timeout server_error].include?(error)
end
end
client = ResilientProxyClient.new
result = client.get_with_retry('https://example.com/api/data')
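With BASE_DELAY = 1.0 and MAX_RETRIES = 3, the waits work out to roughly 1 s, 2 s, and 4 s, each padded with up to 0.5 s of random jitter so that concurrent clients don't retry in lockstep.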
Typhoeus: parallel requests via libcurl
Typhoeus is a wrapper around libcurl with support for parallel requests through Hydra. It is ideal for high-throughput scraping.
Single request through a proxy
require 'typhoeus'
class TyphoeusProxyClient
PROXY_URL = 'http://user-country-US:password@gate.proxyhat.com:8080'
def fetch(url, follow_redirects: true)
request = Typhoeus::Request.new(
url,
method: :get,
proxy: PROXY_URL,
followlocation: follow_redirects,
timeout: 30,
connecttimeout: 10,
headers: {
'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Accept' => 'text/html,application/xhtml+xml',
'Accept-Language' => 'en-US,en;q=0.9'
}
)
request.run
response = request.response
if response.success?
{
status: response.code,
headers: response.headers,
body: response.body,
total_time: response.total_time
}
elsif response.timed_out?
{ error: 'timeout', message: 'Request timed out' }
elsif response.code == 0
{ error: 'connection_failed', message: response.return_message }
else
{ error: 'http_error', status: response.code, body: response.body }
end
end
end
client = TyphoeusProxyClient.new
result = client.fetch('https://httpbin.org/headers')
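A quick sanity check for any proxy setup is to compare the egress IP with and without the proxy; httpbin.org/ip echoes the caller's address. A minimal sketch (the proxy URL is a placeholder):
require 'typhoeus'
direct = Typhoeus.get('https://httpbin.org/ip', timeout: 10)
proxied = Typhoeus.get('https://httpbin.org/ip', timeout: 10,
proxy: 'http://user:pass@gate.proxyhat.com:8080')
puts "Direct:  #{direct.body}"
puts "Proxied: #{proxied.body}" # should report a different origin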
Parallel requests with Hydra
require 'typhoeus'
require 'concurrent' # gem 'concurrent-ruby', provides Concurrent::Hash used below
class ParallelScraper
attr_reader :proxy_url
def initialize(username: 'user-country-US', password: 'your_password')
@proxy_url = "http://#{username}:#{password}@gate.proxyhat.com:8080"
end
def fetch_all(urls, concurrency: 50)
hydra = Typhoeus::Hydra.new(max_concurrency: concurrency)
results = Concurrent::Hash.new
mutex = Mutex.new
urls.each_with_index do |url, index|
request = Typhoeus::Request.new(
url,
method: :get,
proxy: proxy_url,
timeout: 30,
headers: random_headers
)
request.on_complete do |response|
mutex.synchronize do
results[index] = process_response(response, url)
end
end
hydra.queue(request)
end
hydra.run
# Return results in the original URL order, not completion order
urls.each_index.map { |i| results[i] }
end
private
def random_headers
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]
{
'User-Agent' => user_agents.sample,
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9',
'Accept-Language' => 'en-US,en;q=0.9'
}
end
def process_response(response, url)
{
url: url,
status: response.code,
success: response.success?,
body: response.success? ? response.body : nil,
error: response.success? ? nil : response.return_message,
time: response.total_time
}
end
end
scraper = ParallelScraper.new
urls = (1..100).map { |i| "https://httpbin.org/delay/#{rand(1..3)}?id=#{i}" }
results = scraper.fetch_all(urls, concurrency: 20)
success_count = results.count { |r| r[:success] }
puts "Успешно: #{success_count}/#{urls.size}"
puts "Среднее время: #{results.sum { |r| r[:time] } / results.size.round(3)}s"
ProxyHat Ruby SDK: IP rotation and geotargeting
ProxyHat provides residential proxies with IP rotation and geotargeting. The SDK simplifies session management and rotation.
require 'net/http'
require 'json'
require 'securerandom'
module ProxyHat
class Client
GATEWAY_HOST = 'gate.proxyhat.com'
HTTP_PORT = 8080
SOCKS_PORT = 1080
attr_reader :username, :password
def initialize(username:, password:)
@username = username
@password = password
end
# Build a proxy URL with targeting options
def proxy_url(country: nil, city: nil, session: nil, sticky: false)
user = build_username(country: country, city: city, session: session, sticky: sticky)
"http://#{user}:#{password}@#{GATEWAY_HOST}:#{HTTP_PORT}"
end
# SOCKS5 URL
def socks_url(country: nil, session: nil)
user = build_username(country: country, session: session, sticky: true)
"socks5://#{user}:#{password}@#{GATEWAY_HOST}:#{SOCKS_PORT}"
end
# HTTP client with automatic rotation
def rotating_client(country: nil)
RotatingClient.new(self, country: country)
end
private
def build_username(country:, city:, session:, sticky:)
parts = [username]
parts << "country-#{country}" if country
parts << "city-#{city}" if city
parts << "session-#{session || SecureRandom.uuid}" if sticky
parts.join('-')
end
end
class RotatingClient
def initialize(client, country: nil)
@client = client
@country = country
end
def get(url)
session = SecureRandom.uuid[0..8]
proxy_url = @client.proxy_url(country: @country, session: session, sticky: true)
uri = URI.parse(url)
proxy_uri = URI.parse(proxy_url)
http = Net::HTTP.new(uri.host, uri.port,
proxy_uri.host, proxy_uri.port,
proxy_uri.user, proxy_uri.password)
http.use_ssl = (uri.scheme == 'https')
http.open_timeout = 30
http.read_timeout = 30
request = Net::HTTP::Get.new(uri.request_uri)
request['User-Agent'] = random_user_agent
response = http.request(request)
{ status: response.code.to_i, body: response.body }
end
private
def random_user_agent
[
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/605.1.15',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) Firefox/121.0'
].sample
end
end
end
# Usage
proxy_hat = ProxyHat::Client.new(
username: 'your_username',
password: 'your_password'
)
# Rotate the IP on every request (residential)
rotating = proxy_hat.rotating_client(country: 'US')
5.times do |i|
result = rotating.get('https://httpbin.org/ip')
puts "Request #{i + 1}: #{JSON.parse(result[:body])['origin']}"
end
# Sticky session with geotargeting
session_id = SecureRandom.uuid
proxy_url = proxy_hat.proxy_url(
country: 'DE',
city: 'berlin',
session: session_id,
sticky: true
)
puts "Proxy URL: #{proxy_url}"
Production example: scraping 1000 URLs
A real-world scenario: collecting data from 1000 pages with rotating residential proxies and error handling.
require 'typhoeus'
require 'concurrent'
require 'json'
require 'logger'
require 'securerandom' # needed for the per-request session IDs below
class ProductionScraper
BATCH_SIZE = 100
MAX_CONCURRENCY = 50
RETRY_LIMIT = 3
def initialize(username:, password:, country: 'US', logger: nil)
@username = username
@password = password
@country = country
@logger = logger || Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
@stats = Concurrent::Hash.new(0)
end
def scrape(urls)
@logger.info "Starting scrape of #{urls.size} URLs"
start_time = Time.now
results = urls.each_slice(BATCH_SIZE).flat_map do |batch|
process_batch(batch)
end
log_summary(results, start_time)
results
end
private
def process_batch(urls)
hydra = Typhoeus::Hydra.new(max_concurrency: MAX_CONCURRENCY)
results = Concurrent::Array.new
urls.each do |url|
session_id = SecureRandom.uuid[0..12]
proxy_url = build_proxy_url(session_id)
request = build_request(url, proxy_url)
request.on_complete do |response|
results << handle_response(response, url)
end
hydra.queue(request)
end
hydra.run
results
end
def build_proxy_url(session_id)
"http://#{@username}-country-#{@country}-session-#{session_id}:#{@password}@gate.proxyhat.com:8080"
end
def build_request(url, proxy_url)
Typhoeus::Request.new(
url,
method: :get,
proxy: proxy_url,
timeout: 45,
connecttimeout: 15,
followlocation: true,
headers: {
'User-Agent' => random_user_agent,
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language' => 'en-US,en;q=0.5',
'Accept-Encoding' => 'gzip, deflate',
'Connection' => 'keep-alive'
}
)
end
def handle_response(response, url)
@stats[:total] += 1
if response.success?
@stats[:success] += 1
{
url: url,
status: response.code,
success: true,
body: response.body,
size: response.body.bytesize,
time: response.total_time
}
elsif response.timed_out?
@stats[:timeout] += 1
{ url: url, success: false, error: 'timeout' }
elsif response.code == 403
@stats[:blocked] += 1
{ url: url, success: false, error: 'blocked', status: 403 }
elsif response.code == 429
@stats[:rate_limited] += 1
{ url: url, success: false, error: 'rate_limited', status: 429 }
else
@stats[:other_error] += 1
{ url: url, success: false, error: 'http_error', status: response.code }
end
end
def random_user_agent
@user_agents ||= File.read('user_agents.txt').split("\n") rescue [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'
]
@user_agents.sample
end
def log_summary(results, start_time)
elapsed = Time.now - start_time
success_rate = (@stats[:success].to_f / @stats[:total] * 100).round(2)
@logger.info "\n" + "="*50
@logger.info "Scraping completed in #{elapsed.round(2)}s"
@logger.info "Total requests: #{@stats[:total]}"
@logger.info "Success: #{@stats[:success]} (#{success_rate}%)"
@logger.info "Timeouts: #{@stats[:timeout]}"
@logger.info "Blocked (403): #{@stats[:blocked]}"
@logger.info "Rate limited (429): #{@stats[:rate_limited]}"
@logger.info "Other errors: #{@stats[:other_error]}"
@logger.info "Requests/sec: #{(@stats[:total] / elapsed).round(2)}"
@logger.info "="*50
end
end
# Run
scraper = ProductionScraper.new(
username: 'your_username',
password: 'your_password',
country: 'US'
)
urls = (1..1000).map { |i| "https://example.com/page/#{i}" }
results = scraper.scrape(urls)
# Persist the successful results
successful = results.select { |r| r[:success] }
File.write('results.json', JSON.pretty_generate(successful))
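Failures need not be terminal: each pass uses fresh session IDs (and therefore fresh exit IPs), so a simple second pass over the failed URLs often recovers most of them. A minimal sketch:
failed_urls = results.reject { |r| r[:success] }.map { |r| r[:url] }
unless failed_urls.empty?
retry_results = scraper.scrape(failed_urls)
puts "Recovered #{retry_results.count { |r| r[:success] }}/#{failed_urls.size} on the second pass"
end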
TLS/SSL: certificates and SNI
When working through a proxy it is important to configure SSL/TLS correctly, especially for sites with self-signed certificates or non-standard setups.
Net::HTTP: SSL configuration
require 'net/http'
require 'openssl'
class SSLProxyClient
def initialize(proxy_host:, proxy_port:, proxy_user:, proxy_pass:)
@proxy_class = Net::HTTP::Proxy(proxy_host, proxy_port, proxy_user, proxy_pass)
end
# Strict SSL verification (the default)
def fetch_strict_ssl(url)
uri = URI.parse(url)
@proxy_class.start(uri.host, uri.port, use_ssl: true) do |http|
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
http.cert_store = default_cert_store
# Restrict the TLS <= 1.2 cipher list; TLS 1.3 suites are configured
# separately by OpenSSL and would not match here
http.ciphers = 'ECDHE+AESGCM:ECDHE+CHACHA20'
http.min_version = OpenSSL::SSL::TLS1_2_VERSION
request = Net::HTTP::Get.new(uri.request_uri)
http.request(request)
end
end
# Relaxed verification (for self-signed certificates)
def fetch_relaxed_ssl(url)
uri = URI.parse(url)
@proxy_class.start(uri.host, uri.port, use_ssl: true) do |http|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
http.ssl_version = :TLSv1_2
# Disable SNI (occasionally needed for legacy servers)
http.sni_host_name = nil if http.respond_to?(:sni_host_name=)
request = Net::HTTP::Get.new(uri.request_uri)
http.request(request)
end
end
# Custom CA certificate
def fetch_with_custom_ca(url, ca_path)
uri = URI.parse(url)
@proxy_class.start(uri.host, uri.port, use_ssl: true) do |http|
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
http.cert_store = OpenSSL::X509::Store.new.tap do |store|
store.add_file(ca_path)
store.set_default_paths
end
request = Net::HTTP::Get.new(uri.request_uri)
http.request(request)
end
end
# Client certificate (mTLS)
def fetch_with_client_cert(url, cert_path:, key_path:, key_password: nil)
uri = URI.parse(url)
@proxy_class.start(uri.host, uri.port, use_ssl: true) do |http|
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
http.cert_store = default_cert_store
http.cert = OpenSSL::X509::Certificate.new(File.read(cert_path))
http.key = OpenSSL::PKey::RSA.new(File.read(key_path), key_password)
request = Net::HTTP::Get.new(uri.request_uri)
http.request(request)
end
end
private
def default_cert_store
store = OpenSSL::X509::Store.new
store.set_default_paths
store
end
end
# Usage
client = SSLProxyClient.new(
proxy_host: 'gate.proxyhat.com',
proxy_port: 8080,
proxy_user: 'user-country-US',
proxy_pass: 'password'
)
# Standard HTTPS
response = client.fetch_strict_ssl('https://example.com')
# Against a self-signed certificate
response = client.fetch_relaxed_ssl('https://internal-server.local')
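For high-value targets you can go a step beyond CA verification and pin the server's leaf certificate. Net::HTTP exposes OpenSSL's verify callback for this; the sketch below is a method you could add to SSLProxyClient, and the fingerprint constant is a placeholder you would capture out of band:
PINNED_SHA256 = 'aabbcc...' # placeholder: expected SHA-256 of the leaf cert (DER form)
def fetch_with_pinning(url)
uri = URI.parse(url)
@proxy_class.start(uri.host, uri.port, use_ssl: true) do |http|
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
http.verify_callback = lambda do |preverify_ok, store_ctx|
next preverify_ok unless store_ctx.error_depth.zero? # only pin the leaf
fingerprint = OpenSSL::Digest::SHA256.hexdigest(store_ctx.current_cert.to_der)
preverify_ok && fingerprint == PINNED_SHA256
end
http.request(Net::HTTP::Get.new(uri.request_uri))
end
end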
Typhoeus: SSL options
require 'typhoeus'
# Strict SSL verification
Typhoeus::Request.new(
'https://example.com',
proxy: 'http://user:pass@gate.proxyhat.com:8080',
ssl_verifypeer: true,
ssl_verifyhost: 2, # verify that the certificate matches the hostname
sslversion: :tlsv1_2
).run
# Verification disabled (testing only)
Typhoeus::Request.new(
'https://self-signed.example',
proxy: 'http://user:pass@gate.proxyhat.com:8080',
ssl_verifypeer: false,
ssl_verifyhost: 0
).run
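Typhoeus also exposes libcurl's CA bundle option, which is the safer alternative to disabling verification for internal hosts: keep ssl_verifypeer enabled and point cainfo at your own CA file (the path is an assumption):
Typhoeus::Request.new(
'https://internal-server.local',
proxy: 'http://user:pass@gate.proxyhat.com:8080',
ssl_verifypeer: true,
ssl_verifyhost: 2,
cainfo: '/etc/ssl/certs/internal-ca.pem' # your CA bundle
).run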
Integration with Ruby on Rails
Faraday middleware for proxies
require 'faraday'
require 'faraday/retry'
# config/initializers/proxy_client.rb
class ProxyFaradayMiddleware < Faraday::Middleware
def initialize(app, proxy_config:)
super(app)
@proxy_config = proxy_config
end
MAX_ATTEMPTS = 3
def call(env, attempt = 1)
session = SecureRandom.uuid[0..12]
proxy_url = build_proxy_url(session)
env.request_headers['X-Proxy-Session'] = session
env[:proxy] = URI.parse(proxy_url)
@app.call(env)
rescue Faraday::TimeoutError, Faraday::ConnectionFailed
# Back off linearly and retry with a fresh proxy session (a new
# session ID means a new exit IP); re-raise once this request's
# retry budget is exhausted
raise if attempt >= MAX_ATTEMPTS
sleep(attempt)
call(env, attempt + 1)
end
private
def build_proxy_url(session)
"http://#{@proxy_config[:username]}-country-#{@proxy_config[:country]}-session-#{session}:#{@proxy_config[:password]}@gate.proxyhat.com:8080"
end
end
# Setting up the Faraday client
module ApiClients
class ScrapingClient
PROXY_CONFIG = {
username: ENV['PROXYHAT_USERNAME'],
password: ENV['PROXYHAT_PASSWORD'],
country: 'US'
}
def connection
@connection ||= Faraday.new do |builder|
builder.use ProxyFaradayMiddleware, proxy_config: PROXY_CONFIG
builder.request :retry, {
max: 3,
interval: 1,
backoff_factor: 2,
retry_statuses: [429, 502, 503, 504]
}
builder.response :json, content_type: /json/
builder.adapter :typhoeus
end
end
def fetch_page(url)
connection.get(url).body
end
end
end
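Usage then comes down to one call; assuming the PROXYHAT_* environment variables above are set, every request goes out with its own proxy session and the retry policy configured in the builder:
client = ApiClients::ScrapingClient.new
data = client.fetch_page('https://httpbin.org/json')
puts data.inspect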
ActiveJob for background scraping
# app/jobs/scraping_job.rb
class ScrapingJob < ApplicationJob
queue_as :scraping
# Limit parallelism at the queue level: ActiveJob itself has no per-job
# concurrency cap, so use a dedicated Sidekiq queue with few worker
# threads or a throttling gem
attr_reader :scraper
def perform(urls_batch, options = {})
@scraper = BatchScraper.new(
country: options['country'] || 'US',
username: ENV['PROXYHAT_USERNAME'],
password: ENV['PROXYHAT_PASSWORD']
)
results = scraper.process_batch(urls_batch)
store_results(results)
enqueue_failed(results)
rescue StandardError => e
Rails.logger.error "ScrapingJob failed: #{e.message}"
Rails.logger.error e.backtrace.first(10).join("\n")
raise e if executions < 3 # executions is ActiveJob's built-in attempt counter
end
private
def store_results(results)
successful = results.select { |r| r[:success] }
return if successful.empty?
ScrapedPage.insert_all(
successful.map do |r|
{
url: r[:url],
content: r[:body],
scraped_at: Time.current,
status_code: r[:status]
}
end
)
end
def enqueue_failed(results)
failed = results.reject { |r| r[:success] }
return if failed.empty?
failed_urls = failed.map { |r| r[:url] }
ScrapingJob.set(wait: 5.minutes).perform_later(failed_urls)
end
end
# app/services/batch_scraper.rb
class BatchScraper
def initialize(country:, username:, password:)
@country = country
@username = username
@password = password
end
def process_batch(urls)
hydra = Typhoeus::Hydra.new(max_concurrency: 20)
results = []
mutex = Mutex.new
urls.each do |url|
request = build_request(url)
request.on_complete do |response|
mutex.synchronize do
results << format_result(response, url)
end
end
hydra.queue(request)
end
hydra.run
results
end
private
def build_request(url)
session = SecureRandom.uuid[0..12]
proxy = "http://#{@username}-country-#{@country}-session-#{session}:#{@password}@gate.proxyhat.com:8080"
Typhoeus::Request.new(
url,
proxy: proxy,
timeout: 30,
headers: {
'User-Agent' => random_user_agent,
'Accept' => 'text/html,application/xhtml+xml'
}
)
end
def format_result(response, url)
{
url: url,
success: response.success?,
status: response.code,
body: response.body
}
end
def random_user_agent
['Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'].sample
end
end
# Triggering via a controller
class ScrapingController < ApplicationController
def create
urls = params[:urls]
batch_size = 50
urls.each_slice(batch_size).with_index do |batch, index|
ScrapingJob.set(wait: index.minutes).perform_later(batch, { 'country' => 'US' })
end
render json: { status: 'queued', batches: (urls.size / batch_size.to_f).ceil }
end
end
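The controller assumes a matching route; a minimal sketch for config/routes.rb (the path and names are assumptions):
# config/routes.rb
Rails.application.routes.draw do
post '/scraping', to: 'scraping#create'
end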
Comparing the approaches

| Method | Parallelism | Performance | Complexity | Best for |
|---|---|---|---|---|
| Net::HTTP | No | Low | Low | Single requests, simple scripts |
| Typhoeus | Yes (Hydra) | High | Medium | Bulk scraping, parallel workloads |
| Faraday + Typhoeus | Yes | High | Medium | Rails applications, API clients |
| ProxyHat SDK | Yes | High | Low | IP rotation, geotargeting |
Key takeaways
- Net::HTTP is the standard library and suits simple tasks without parallelism.
- Typhoeus is the best choice for high-throughput scraping with parallel requests.
- ProxyHat provides residential proxies with IP rotation and geotargeting via gate.proxyhat.com:8080.
- Always configure a retry mechanism with exponential backoff for production systems.
- In Rails, use Faraday middleware and ActiveJob for background work.
- SSL/TLS deserves attention: verify certificates and use up-to-date protocol versions.
Ready to start? Check out ProxyHat's plans and available locations for your next project.