Scrapy Proxy Setup: Crawling Free Proxies in Practice
Setting a proxy keeps the crawler from being banned for hitting a site too frequently.
Paid proxies need no introduction; here I crawl free ones instead.
Scrapy handles proxies through the HttpProxyMiddleware downloader middleware (enabled by default).
In Scrapy, setting a proxy essentially means writing the proxy server's URL into request.meta['proxy'].
If the proxy requires authentication, the username and password are passed via the Proxy-Authorization field in the HTTP headers.
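For example, here is a minimal sketch of setting a proxy by hand inside a spider; the proxy address and account are made-up placeholders, and the Proxy-Authorization header is only needed when the proxy asks for a login:

import base64

import scrapy


class ProxyDemoSpider(scrapy.Spider):
    name = 'proxy_demo'

    def start_requests(self):
        proxy = 'http://proxy.example.com:8080'     # placeholder proxy server
        creds = base64.b64encode(b'user:password')  # placeholder account, base64-encoded
        yield scrapy.Request(
            'http://httpbin.org/ip',
            # HttpProxyMiddleware routes the request through whatever meta['proxy'] points to
            meta={'proxy': proxy},
            # only needed for proxies that require authentication
            headers={'Proxy-Authorization': b'Basic ' + creds},
        )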
Practice
Now let's put it into practice.
xicidaili seems to be gone,
so I used 快代理 (kuaidaili.com) and its list of free domestic high-anonymity HTTP proxies.
Kick things off with the usual three commands:
scrapy startproject proxy;
cd proxy;
scrapy genspider kuaidaili kuaidaili.com;
Then get straight to crawling:
import json

import scrapy


class KuaidailiSpider(scrapy.Spider):
    name = 'kuaidaili'
    allowed_domains = ['kuaidaili.com']
    # start_urls = ['http://kuaidaili.com/free/inha']

    def start_requests(self):
        for i in range(1, 4):
            yield scrapy.Request("https://www.kuaidaili.com/free/inha/%s/" % i)

    def parse(self, response):
        for sel in response.xpath("//tbody//tr"):
            ip = sel.xpath(".//td[1]/text()").extract_first()
            port = sel.xpath(".//td[2]/text()").extract_first()
            scheme = sel.xpath(".//td[4]/text()").extract_first().lower()
            # Verify the scraped proxy by requesting httpbin.org through it
            url = "%s://httpbin.org/ip" % scheme
            proxy = "%s://%s:%s" % (scheme, ip, port)
            meta = {
                'proxy': proxy,
                'dont_retry': True,
                'download_timeout': 10,
                # The following two fields are only used for verification
                '_proxy_scheme': scheme,
                '_proxy_ip': ip
            }
            yield scrapy.Request(url, callback=self.check_available, meta=meta, dont_filter=True)

    def check_available(self, response):
        proxy_ip = response.meta['_proxy_ip']
        # If httpbin reports the proxy's IP as the origin, the proxy is anonymous and working
        if proxy_ip == json.loads(response.text)['origin']:
            yield {
                'proxy_scheme': response.meta['_proxy_scheme'],
                'proxy': response.meta['proxy']
            }
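The check works because httpbin.org/ip returns a JSON body of the form {"origin": "<requesting IP>"}: if the request really went out through a working high-anonymity proxy, the origin should be the proxy's own IP, so comparing it against the scraped IP weeds out dead or transparent proxies.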
Tweak a few things in settings.py.
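Roughly something like the following; treat these values as a sketch of the kind of changes I mean, not the exact settings from my run:

# settings.py -- illustrative values only
ROBOTSTXT_OBEY = False      # don't let robots.txt block the crawl
DOWNLOAD_DELAY = 1          # go easy on the listing site
USER_AGENT = 'Mozilla/5.0'  # a browser-like UA instead of Scrapy's default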
After trying it I got a 503 straight away.
Haven't figured that one out yet…
Trying again
So I switched sites and picked one hosted outside the GFW.
I gave it a try, but it seems to be a dynamically rendered page, and I'm lazy,
so I switched to yet another one:
Free Proxy List - Just Checked Proxy List (free-proxy-list.net)
This one works.
import json

import scrapy


class FreeScrapySpider(scrapy.Spider):
    name = 'free_proxy'
    allowed_domains = ['free-proxy-list.net']
    start_urls = ["https://free-proxy-list.net/"]

    # def start_requests(self):
    #     for i in range(1, 4):
    #         yield scrapy.Request("http://proxy-list.org/english/index.php?p=%s" % i)

    def parse(self, response):
        for sel in response.xpath("//tbody//tr"):
            ip = sel.xpath("./td[1]/text()").extract_first()
            port = sel.xpath("./td[2]/text()").extract_first()
            # Column 7 is the "Https" column, holding 'yes' or 'no'
            scheme = sel.xpath("./td[7]/text()").extract_first()
            if scheme == 'yes':
                scheme = 'https'
            else:
                scheme = 'http'
            url = "%s://httpbin.org/ip" % scheme
            proxy = "%s://%s:%s" % (scheme, ip, port)
            # ip = ip_port.split(":")[0]
            meta = {
                'proxy': proxy,
                'download_timeout': 10,
                'dont_retry': True,
                '_proxy_scheme': scheme,
                '_proxy_ip': ip
            }
            yield scrapy.Request(url, callback=self.check_available, meta=meta, dont_filter=True)

    def check_available(self, response):
        proxy_ip = response.meta['_proxy_ip']
        if proxy_ip == json.loads(response.text)['origin']:
            yield {
                'proxy_scheme': response.meta['_proxy_scheme'],
                'proxy': response.meta['proxy']
            }
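Run the spider and export the verified proxies to a JSON file, say proxy_list.json, which the random-proxy middleware below will read:
scrapy crawl free_proxy -o proxy_list.json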
Then implement random proxy selection with a downloader middleware (in middlewares.py):
import json
import random
from collections import defaultdict

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.exceptions import NotConfigured


class RandomHttpProxyMiddleware(HttpProxyMiddleware):

    def __init__(self, auth_encoding='latin-1', proxy_list_file=None):
        if not proxy_list_file:
            raise NotConfigured
        self.auth_encoding = auth_encoding
        # Group proxies by scheme so http requests get http proxies and https requests get https proxies
        self.proxies = defaultdict(list)
        with open(proxy_list_file) as f:
            proxy_list = json.load(f)
            for proxy in proxy_list:
                scheme = proxy['proxy_scheme']
                url = proxy['proxy']
                self.proxies[scheme].append(self._get_proxy(url, scheme))

    @classmethod
    def from_crawler(cls, crawler):
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING', 'latin-1')
        proxy_list_file = crawler.settings.get('HTTPPROXY_PROXY_LIST_FILE')
        return cls(auth_encoding, proxy_list_file)

    def _set_proxy(self, request, scheme):
        # Pick a random proxy of the matching scheme for each request
        creds, proxy = random.choice(self.proxies[scheme])
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
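The middleware expects the proxy list file to be exactly what the spider exported: a JSON array of objects with proxy_scheme and proxy keys, along these lines (addresses made up):

[
  {"proxy_scheme": "http", "proxy": "http://203.0.113.5:8080"},
  {"proxy_scheme": "https", "proxy": "https://198.51.100.7:3128"}
]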
Then update settings.py:
DOWNLOADER_MIDDLEWARES = {
    # 'free_proxy.middlewares.FreeProxyDownloaderMiddleware': 543,
    'free_proxy.middlewares.RandomHttpProxyMiddleware': 745,
}
HTTPPROXY_PROXY_LIST_FILE = 'proxy_list.json'
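Priority 745 slots the custom middleware in just ahead of the built-in HttpProxyMiddleware (which sits at 750 in DOWNLOADER_MIDDLEWARES_BASE), so the random proxy is chosen before the stock middleware runs.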
Write a test spider to check that requests really go out through the configured proxies:

import json

import scrapy


class HubSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['httpbin.org']
    # start_urls = ['http://httpbin.org/ip']

    def start_requests(self):
        for i in range(10):
            yield scrapy.Request('http://httpbin.org/ip', dont_filter=True)
            yield scrapy.Request('https://httpbin.org/ip', dont_filter=True)

    def parse(self, response):
        print(json.loads(response.text))
        yield {
            'proxy': json.loads(response.text)['origin']
        }
You can see that the origin IPs reported back are exactly the proxies we configured.