requests基本使用

  • 2018-11-07
  • 325
  • 0
# 多行输出结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

requests官方文档。本文是学习崔大的《Python3网络爬虫实战》时所做的笔记,完整的详细内容可购买他的书。

1. requests基础

1.1 requests简单请求

# Minimal GET request: fetch httpbin's /get endpoint and print the body,
# the HTTP status code and the cookies returned with the response.
import requests as rq

base_url = 'http://httpbin.org/'
# "httpbin.org" is A simple HTTP Request & Response Service
r = rq.get(base_url + 'get')  # NOTE: the URL must include its scheme (http or https)
print(r.text)
print(r.status_code)
print(r.cookies)
{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "origin": "222.41.155.17", 
  "url": "http://httpbin.org/get"
}

200
<RequestsCookieJar[]>
r = rq.post(base_url + 'post')
print(r.text)
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "0", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "json": null, 
  "origin": "223.87.231.164", 
  "url": "http://httpbin.org/post"
}

r = rq.put(base_url + 'put')
print(r.text)
r = rq.delete(base_url + 'delete')
print(r.text)

1.2 get请求

1.2.1 提交数据以及构造header

params = {
    'name': 'Pauli',
    'age': 22
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
        'Chrome/68.0.3440.75 Safari/537.36'
}

test_url = 'http://httpbin.org/get'
r = rq.get(test_url, params=params, headers=headers)  # 注意url不能省略scheme协议-http或https
print(r.text)
{
  "args": {
    "age": "22", 
    "name": "Pauli"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"
  }, 
  "origin": "125.76.214.248", 
  "url": "http://httpbin.org/get?name=Pauli&age=22"
}

1.2.2 抓取二进制文件

图片,音频,视频都是由二进制码组成,获取它们的方式就是保存这些二进制码

img_url = 'https://github.com/favicon.ico'
r = rq.get(img_url)
with open('./temp/image0.png', 'wb') as f:
    f.write(r.content)
6518

requests下载文件高级用法:一个 chunk 一个 chunk 的下载, 而不是要全部下载完才能保存

# Streamed download: with stream=True the body is fetched chunk by chunk
# while iterating, instead of being read in full before it is saved.
# The outer `with` closes the response (and releases its connection) even
# if writing to disk fails part-way through.
with rq.get(img_url, stream=True) as r:
    with open('./temp/image1.png', 'wb') as f:
        for chunk in r.iter_content(chunk_size=32):
            f.write(chunk)

1.3 post请求

注意与get的区别,post的数据是使用 data 参数而不是 params

post提交的标志:
– 数据在form里。
– “Content-Type”: “application/x-www-form-urlencoded”

data = {
    'name': 'Pauli',
    'age': 22
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
        'Chrome/68.0.3440.75 Safari/537.36'
}

test_url = 'http://httpbin.org/post'
r = rq.post(test_url, data=data, headers=headers)  # 注意url不能省略scheme协议-http或https
print(r.text)
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "age": "22", 
    "name": "Pauli"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "17", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"
  }, 
  "json": null, 
  "origin": "125.76.214.248", 
  "url": "http://httpbin.org/post"
}

1.4 响应

r = rq.get('https://www.jianshu.com', headers=headers)
print(type(r.status_code), r.status_code, end='\n\n')
print(type(r.headers), r.headers, end='\n\n')
print(type(r.cookies), r.cookies, end='\n\n')
print(type(r.url), r.url, end='\n\n')
print(type(r.history), r.history, end='\n\n')
<class 'int'> 200

<class 'requests.structures.CaseInsensitiveDict'> {'Date': 'Tue, 21 Aug 2018 12:20:17 GMT', 'Server': 'Tengine', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'X-Frame-Options': 'DENY', 'X-XSS-Protection': '1; mode=block', 'X-Content-Type-Options': 'nosniff', 'ETag': 'W/"0e2092c82290fa25320b72aca25b73a7"', 'Cache-Control': 'max-age=0, private, must-revalidate', 'Set-Cookie': 'locale=zh-CN; path=/', 'X-Request-Id': '4105ff7e-9944-4c3b-86f6-e3150982f896', 'X-Runtime': '0.006964', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Content-Encoding': 'gzip', 'X-Via': '1.1 PStjdxro13:3 (Cdn Cache Server V2.0), 1.1 PSzjtzsx2kf43:3 (Cdn Cache Server V2.0), 1.1 PSscyd4iw27:1 (Cdn Cache Server V2.0)', 'Connection': 'keep-alive'}

<class 'requests.cookies.RequestsCookieJar'> <RequestsCookieJar[<Cookie locale=zh-CN for www.jianshu.com/>]>

<class 'str'> https://www.jianshu.com/

<class 'list'> []

2. requests高级用法

2.1 文件上传

参数:files

# Upload a file as multipart/form-data through the `files` parameter.
# The file is opened in a context manager so its handle is closed as soon as
# the request has been sent, rather than being leaked.
with open('test_txt.txt', 'rb') as upload:
    files = {'file': upload}
    r = rq.post('http://httpbin.org/post', files=files)
print(r.text)
{
  "args": {}, 
  "data": "", 
  "files": {
    "file": "\u95ee\u9898: \u94f6\u6cb3\u7cfb\u4e2d\u5fc3\u7684\u8d85\u5927\u9ed1\u6d1e\u600e\u4e48\u5f62\u6210\u7684\uff1f\r\n\u4f5c\u8005: Berg\r\n==================================================\r\n\u95ee\u9898: \u94f6\u6cb3\u7cfb\u4e2d\u5fc3\u7684\u8d85\u5927\u9ed1\u6d1e\u600e\u4e48\u5f62\u6210\u7684\uff1f\r\n\u4f5c\u8005: Berg\r\n==================================================\r\n"
  }, 
  "form": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "396", 
    "Content-Type": "multipart/form-data; boundary=bc8f78e400224b90b5412b8c234f1911", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "json": null, 
  "origin": "125.76.214.248", 
  "url": "http://httpbin.org/post"
}

2.2 Cookies

参数:cookies

2.2.1 获取Cookies

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
        'Chrome/68.0.3440.75 Safari/537.36'
}

r = rq.get('https://www.baidu.com', headers=headers)
print(r.cookies)
# print(dir(r.cookies))
for key, value in r.cookies.items():
    print(key + ' = ' + value)
<RequestsCookieJar[<Cookie BAIDUID=B1131D8F421B9B548CD8263BE46F518D:FG=1 for .baidu.com/>, <Cookie BIDUPSID=B1131D8F421B9B548CD8263BE46F518D for .baidu.com/>, <Cookie H_PS_PSSID=26525_1431_21103_26350_27245_22157 for .baidu.com/>, <Cookie PSTM=1538313389 for .baidu.com/>, <Cookie delPer=0 for .baidu.com/>, <Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>]>
BAIDUID = B1131D8F421B9B548CD8263BE46F518D:FG=1
BIDUPSID = B1131D8F421B9B548CD8263BE46F518D
H_PS_PSSID = 26525_1431_21103_26350_27245_22157
PSTM = 1538313389
delPer = 0
BDSVRTM = 0
BD_HOME = 0

2.2.2 使用自己复制的cookies

方法一:将 cookies 字符串放入 headers 的 Cookie 字段

headers = {
    'Cookie': '_zap=66750a58-b5a3-40e6-824d-aa0711cc1696; ' +
        'd_c0="AMDmIgJnSQ6PTuQ0CAdhHin_dRQ9PaDITZs=|1538224118"; ' +
        'q_c1=381b12b4c5274837835194aeacb5308b|1538224122000|1538224122000; ' +
        'tgw_l7_route=69f52e0ac392bb43ffb22fc18a173ee6; _xsrf=tNjue5X6IgnCy1BWEqN0v2JrMRaF1G7E; ' +
        'capsion_ticket="2|1:0|10:1538314169|14:capsion_ticket|44:YWNhYTU0YThmYTZmNDcyOWI4Njc3ND' +
        'gyYjdhYTE2NWE=|852308bdfa48f5676233e506c8262062dc7af67db29b4bb75b667c157a6b2b98"; ' +
        'z_c0="2|1:0|10:1538314171|4:z_c0|92:Mi4xTmRJcEJRQUFBQUFBd09ZaUFtZEpEaVlBQUFCZ0FsVk' +
        '51eDJlWEFBb1JERHdQdnZIelVWWlRrMHAxUXl3YjktSUVR|85e876c90cbb14a1fe0da8a6d848144b935' +
        'c4b9e16b9d0cfad0a6e51fe919f07"; tst=r',
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
        '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
r = rq.get('https://www.zhihu.com/', headers=headers)
result = '写文章' in r.text  # 登陆的用户才有此用户菜单栏
print(result)
True

方法二:在发送请求时传递给参数 cookies,不过需要先将 cookies 构造成 RequestsCookieJar 对象

cookies = '_zap=66750a58-b5a3-40e6-824d-aa0711cc1696; ' +\
    'd_c0="AMDmIgJnSQ6PTuQ0CAdhHin_dRQ9PaDITZs=|1538224118"; ' +\
    'q_c1=381b12b4c5274837835194aeacb5308b|1538224122000|1538224122000; ' +\
    'tgw_l7_route=69f52e0ac392bb43ffb22fc18a173ee6; _xsrf=tNjue5X6IgnCy1BWEqN0v2JrMRaF1G7E; ' +\
    'capsion_ticket="2|1:0|10:1538314169|14:capsion_ticket|44:YWNhYTU0YThmYTZmNDcyOWI4Njc3ND' +\
    'gyYjdhYTE2NWE=|852308bdfa48f5676233e506c8262062dc7af67db29b4bb75b667c157a6b2b98"; ' +\
    'z_c0="2|1:0|10:1538314171|4:z_c0|92:Mi4xTmRJcEJRQUFBQUFBd09ZaUFtZEpEaVlBQUFCZ0FsVk' +\
    '51eDJlWEFBb1JERHdQdnZIelVWWlRrMHAxUXl3YjktSUVR|85e876c90cbb14a1fe0da8a6d848144b935' +\
    'c4b9e16b9d0cfad0a6e51fe919f07"; tst=r'

# Build a RequestsCookieJar from the raw "name=value; name=value" string.
cookiesJar = rq.cookies.RequestsCookieJar()
for cookie in cookies.split(';'):
    # Pairs are separated by '; ', so a plain split(';') leaves a leading
    # space on every name after the first; strip it before storing.
    cookie = cookie.strip()
    # Skip empty segments (e.g. after a trailing ';') and pairs without '=',
    # which would otherwise make the unpacking below raise ValueError.
    if '=' not in cookie:
        continue
    # Split on the first '=' only: some values contain '=' themselves.
    key, value = cookie.split('=', 1)
    cookiesJar.set(key.strip(), value.strip())

# 模拟使用
headers = {
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
        '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
r = rq.get('https://www.zhihu.com/', cookies=cookiesJar, headers=headers)
result = '写文章' in r.text  # 登陆的用户才有此用户菜单栏
print(result)

2.3 会话维持

类:requests.Session()
在requests中,每次请求都相当于在各自独立的浏览器中打开;而有时需要像在同一浏览器中那样连续打开页面,这就需要使用相同的cookies。可以在每次发送请求时都加上cookies,但这样太繁琐,更好的做法是维护一个requests会话(Session)。

rq.get('http://httpbin.org/cookies/set/num/65536')
r = rq.get('http://httpbin.org/cookies')
print(r.text)
{
  "cookies": {}
}

http://httpbin.org/cookies/set/num/65536 请求此网址会生成一个cookies,名称num,值65536。
然后又请求http://httpbin.org/cookies,此网址可以获取当前的cookies,发现并不能获取到刚设置的cookies。
下面使用会话即可轻松获取

sess = rq.Session()
sess.get('http://httpbin.org/cookies/set/num/65536')
r = sess.get('http://httpbin.org/cookies')
print(r.text)
{
  "cookies": {
    "num": "65536"
  }
}

2.4 SSL证书验证

参数:verify
有时有的网站的证书没有被官方CA信任,会出现证书错误情况。将请求的 verify 参数设置为 False,即传入参数verify=False
此时即可正常请求,但仍然有警告。
忽略警告的方法:
from requests.packages import urllib3
urllib3.disable_warnings()

或者把警告捕获到日志中:

import logging
logging.captureWarnings(True)

也可以加载本地的证书(私有证书必须是解密的)

# Client-side certificate given as a (certificate file, private key file) pair.
# `url` is a placeholder; the module is referenced as `rq`, matching the
# `import requests as rq` used by the rest of this article.
rq.get(url, cert=('/path/server.crt', '/path/key'))

2.5 代理设置

参数:proxies

  1. 一般代理
proxies = {
    'http': 'http://0.0.0.0:0',
    'https': 'http://0.0.0.1:0',
}
rq.get('*', proxies=proxies)
  1. HTTP Basic Auth 代理,使用语法 http://user:passwd@host:port
proxies = {
    'http': 'http://user:passwd@host:port'
}
rq.get('*', proxies=proxies)
  1. SOCKS 协议代理,需要先安装 socks 依赖:pip3 install 'requests[socks]'
proxies = {
    'http': 'socks5://user:passwd@host:port'
}
rq.get('*', proxies=proxies)

2.6 超时设置

参数 timeout 。默认是 None ,也就是一直等待
有时因为网络原因或者服务器原因导致网页响应缓慢,可以设置超时时间。

r = rq.get('https://www.baidu.com', timeout=1)
print(r.status_code)
200

2.7 身份认证

  • HTTPBasicAuth 认证参数:auth
r = rq.get('*', auth=('username', 'passwd'))
print(r.status_code)

2.8 Prepared Request

Prepared Request: Request 的各个参数都可以通过一个 Request 对象来表示

url = 'http://httpbin.org/post'
data = {
    'name': 'Planck'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
        '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

sess = rq.Session()
req = rq.Request('POST', url, data=data, headers=headers)
prepped = sess.prepare_request(req)
r = sess.send(prepped)
print(r.text)
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Planck"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"
  }, 
  "json": null, 
  "origin": "222.41.155.17", 
  "url": "http://httpbin.org/post"
}


版权声明: 本网站所有资源采用BY-NC-SA 4.0协议进行授权,转载应当以相同方式注明文章来自:requests基本使用 - 一方的天地

评论

还没有任何评论,你来说两句吧

发表评论

陕ICP备18010914号
知识共享许可协议
本作品由一方天地采用知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议进行许可,转载或引用本站文章应遵循相同协议。如果有侵犯版权的资源请尽快联系站长,本站会在24h内删除有争议的资源。 -