import urllib.request

def getHtml(url):
    # Fetch a page with the standard library, timing out after 5 seconds
    response = urllib.request.urlopen(url, timeout=5)
    result = response.read().decode('utf-8')
    return result

# The same fetch with the third-party requests library is a one-liner
import requests
result = requests.get(url).text
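Either version fetches the page source the same way; for example (a trivial usage sketch, with example.org standing in for any target):

html = getHtml('http://example.org')
print(html[:200])  # first 200 characters of the page source

Many sites, however, refuse or throttle requests whose User-Agent reveals a script, so scrapers usually keep a pool of real browser User-Agent strings to rotate through: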
userAgentList = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; "
    ".NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR "
    "2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR "
    "3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; "
    ".NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR "
    "3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 ("
    "Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 "
    "Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
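With the pool in place, a common pattern is to draw a random entry for each request. A minimal sketch (the target URL is just a placeholder):

import random
import urllib.request

# Present a different browser identity on each request
req = urllib.request.Request(
    'http://example.org',
    headers={'User-Agent': random.choice(userAgentList)},
)
html = urllib.request.urlopen(req, timeout=5).read().decode('utf-8')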
Besides the User-Agent, sites often check two more header fields: referer and cookie. Both libraries attach all of these headers the same way, via a plain dict:
import urllib.request
import requests

header = {
    'user-agent': 'add your agent',
    'referer': 'add your referer',
    'cookie': 'add your cookie',
}

# urllib: wrap the URL and headers in a Request object first
req = urllib.request.Request('https://www.zhihu.com/hot', headers=header)
response = urllib.request.urlopen(req, timeout=5)
content = response.read().decode('utf-8')

# requests: pass the headers dict directly
content = requests.get('https://www.zhihu.com/hot', headers=header).text
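To confirm what actually goes over the wire, point the same code at a header-echo service. The sketch below uses httpbin.org (an assumption on my part: any endpoint that reflects request headers back would do):

import requests

header = {
    'user-agent': 'add your agent',
    'referer': 'add your referer',
}
# httpbin.org/headers returns the headers it received, as JSON
print(requests.get('https://httpbin.org/headers', headers=header, timeout=5).text)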
Adding a proxy
# Build a proxy handler
httpproxy_handler = urllib.request.ProxyHandler(
    {
        "http": "222.221.11.119:3128",
        "https": "125.32.233.215:8118",
    },
)
# Create a custom opener object with urllib.request.build_opener()
opener = urllib.request.build_opener(httpproxy_handler)
request = urllib.request.Request("http://www.baidu.com/")
# 1. Written this way, only requests sent with opener.open() go through the
#    custom proxy; urlopen() still connects directly.
response = opener.open(request)
# 2. Written this way instead, the opener is installed globally: every request,
#    whether sent with opener.open() or urlopen(), uses the custom proxy.
# urllib.request.install_opener(opener)
# response = urllib.request.urlopen(request)
content = response.read().decode()
# requests: pass a proxies dict directly with the request
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
content = requests.get("http://example.org", proxies=proxies).text
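Proxy addresses like the ones above are placeholders and tend to die quickly, so real code should expect failures. A minimal defensive sketch (the fetch helper is hypothetical, not part of either library):

import requests

def fetch(url, proxies=None):
    try:
        return requests.get(url, proxies=proxies, timeout=5).text
    except requests.exceptions.RequestException:
        # proxy unreachable or too slow; retry with a direct connection
        return requests.get(url, timeout=5).text

content = fetch("http://example.org", proxies=proxies)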
The URLs we request often carry Chinese in their parameters. For example, take an arbitrary site, 搜百度盘 (a Baidu-netdisk search engine), and search for 句号; the resulting URL appears to be:
https://www.sobaidupan.com/search.asp?wd=句号&so_md5key=5c829f0990bbca09ba8eb56123e11197
A quick explanation, easy to follow for anyone familiar with URLs: everything after the ? is the query string, with parameters separated by &. So wd is clearly the search keyword, and the md5 parameter looks like some kind of verification token. Deleting it turns out to return exactly the same results, so we can drop it from our requests.
Copy the shortened URL (without that parameter) and paste it into Python, and it becomes:
https://www.sobaidupan.com/search.asp?wd=%E5%8F%A5%E5%8F%B7
Clearly, the Chinese has been percent-encoded. The urllib library has dedicated functions for this encoding:
import urllib.parse

# Encode a single value
word = '句号'
urlWord = urllib.parse.quote(word)
# %E5%8F%A5%E5%8F%B7

# Encode a whole dict of parameters into a query string
briefs = {'page': 10, 'wd': '句号'}
urlWord = urllib.parse.urlencode(briefs)
# page=10&wd=%E5%8F%A5%E5%8F%B7
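Putting the pieces together, we can rebuild the search URL we started from, and urllib.parse.unquote() reverses the encoding (a short sketch using the URL analyzed above):

import urllib.parse

base = 'https://www.sobaidupan.com/search.asp?'
fullUrl = base + urllib.parse.urlencode({'wd': '句号'})
# https://www.sobaidupan.com/search.asp?wd=%E5%8F%A5%E5%8F%B7

print(urllib.parse.unquote('%E5%8F%A5%E5%8F%B7'))  # 句号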