Coding workflow:
1. Specify the URL
2. Send the request
3. Get the response data
4. Parse the data
5. Persist the results
Ways to extract the targeted content from a page:
1. Regular expressions
2. bs4
3. xpath (***)
General idea of data parsing:
1. The local text fragments we want are stored either between tags or in a tag's attributes
2. First locate the target tags
3. Then extract (parse) the data stored in the tags or in their attributes
Requirement: crawl all images under the funny-picture (糗图) section of Qiushibaike (糗事百科)
# Download a single image
import requests

if __name__ == "__main__":
    # How to crawl image data
    url = 'https://pic.qiushibaike.com/system/pictures/12355/123558951/medium/MDGNC2ANV4CG11N9.jpg'
    # .content returns the image data in binary form
    # .text -> str, .content -> bytes, .json() -> Python object
    img_data = requests.get(url=url).content
    with open('./python爬虫/第三章:数据解析/qiutu.jpg', 'wb') as fp:
        fp.write(img_data)
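To make the three response accessors mentioned in the comment above concrete, here is a minimal sketch; the httpbin.org endpoint is only an assumed demo URL, any JSON API would do:

import requests

if __name__ == "__main__":
    resp = requests.get('https://httpbin.org/get')  # assumed demo endpoint
    print(type(resp.text))     # str: the decoded response body
    print(type(resp.content))  # bytes: the raw response body
    print(type(resp.json()))   # dict: the body parsed as JSON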
# Crawl all images on one page with a regular expression
import requests
import re
import os

if __name__ == "__main__":
    # Create a folder to hold all the images
    if not os.path.exists('./python爬虫/第三章:数据解析/qiutuLibs'):
        os.mkdir('./python爬虫/第三章:数据解析/qiutuLibs')
    url = 'https://www.qiushibaike.com/imgrank/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    # Focused crawling: parse/extract every image URL on the page
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        # Build the full image URL
        src = 'https:' + src
        # Request the binary image data
        img_data = requests.get(url=src, headers=headers).content
        # Derive the file name from the URL
        img_name = src.split('/')[-1]
        # Path where the image will be stored
        imgPath = './python爬虫/第三章:数据解析/qiutuLibs/' + img_name
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!')
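The re.S flag passed to re.findall above matters because the target <div> spans several lines in the page source; without it, '.' does not match newlines. A small self-contained illustration (the image URL in the test string is made up):

import re

html = '<div class="thumb">\n<img src="//pic.example.com/a.jpg" alt="demo">\n</div>'
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
print(re.findall(ex, html))        # [] because '.' stops at the newline
print(re.findall(ex, html, re.S))  # ['//pic.example.com/a.jpg']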
# Crawl images across multiple pages
import requests
import re
import os

if __name__ == "__main__":
    # Create a folder to hold all the images
    if not os.path.exists('./python爬虫/第三章:数据解析/qiutuLibs'):
        os.mkdir('./python爬虫/第三章:数据解析/qiutuLibs')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # A generic URL template with a page-number placeholder
    url = 'https://www.qiushibaike.com/imgrank/page/%d'
    for pageNum in range(1, 36):
        # Build the URL for the current page number
        new_url = url % pageNum
        page_text = requests.get(url=new_url, headers=headers).text
        # Focused crawling: parse/extract every image URL on the page
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        img_src_list = re.findall(ex, page_text, re.S)
        for src in img_src_list:
            # Build the full image URL
            src = 'https:' + src
            # Request the binary image data
            img_data = requests.get(url=src, headers=headers).content
            # Derive the file name from the URL
            img_name = src.split('/')[-1]
            # Path where the image will be stored
            imgPath = './python爬虫/第三章:数据解析/qiutuLibs/' + img_name
            with open(imgPath, 'wb') as fp:
                fp.write(img_data)
            print(img_name, '下载成功!')
bs4 data parsing (a parsing approach specific to Python)
1. Instantiate a BeautifulSoup object and load the page source data into it
2. Locate tags and extract data by calling the BeautifulSoup object's attributes and methods
Environment setup:
pip install bs4
pip install lxml
1. Instantiating the object, in one of two ways:
    1. Load the data of a local HTML file into the object
    2. Load page source fetched from the internet into the object
# Method 1: from a local HTML file
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Load the data of a local HTML file into the object
    fp = open('./test.html', 'r', encoding='utf-8')
    soup = BeautifulSoup(fp, 'lxml')

# Method 2: from page source fetched over the internet
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Load page source fetched from the internet into the object
    page_text = requests.get(url='').text
    soup = BeautifulSoup(page_text, 'lxml')
2. Attributes and methods provided for data parsing
    - soup.tagName: returns the first occurrence of tagName in the HTML
    - soup.find()
        + soup.find('tagName'): equivalent to soup.tagName
        + locating by attribute: soup.find('div', class_/id/attr='song')
    - soup.find_all('tagName'): returns all matching tags (as a list)
    - select
        + soup.select('some CSS selector (id, class, tag, ...)'): returns a list
        + soup.select('.tang > ul > li > a'): > denotes one level of hierarchy
        + soup.select('.tang > ul a'): a space denotes any number of levels
    - Getting the text between tags
        + soup.a.text / soup.a.string / soup.a.get_text()
        + text / get_text(): returns all text content inside a tag, including descendants
        + string: returns only the text that is a direct child of that tag
    - Getting a tag's attribute value
        + soup.a['href']
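A minimal, self-contained sketch of these accessors; the HTML string is made up purely for illustration:

from bs4 import BeautifulSoup

# Hypothetical snippet used only to demonstrate the accessors listed above
html = '''
<div class="tang">
  <ul>
    <li><a href="http://example.com/1">静夜思 <span>李白</span></a></li>
    <li><a href="http://example.com/2">春晓</a></li>
  </ul>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.a)                              # first <a> tag in the document
print(soup.find('a'))                      # same as soup.a
print(soup.find_all('a'))                  # list of every <a> tag
print(soup.select('.tang > ul > li > a'))  # CSS selector, returns a list
print(soup.a.text)                         # '静夜思 李白': all nested text
print(soup.a.string)                       # None: this tag has more than one child node
print(soup.a['href'])                      # 'http://example.com/1'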
Requirement: crawl every chapter title and chapter body of the novel 三国演义 (Romance of the Three Kingdoms)
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Crawl the table-of-contents page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = requests.get(url=url, headers=headers).text
    # Parse the chapter titles and detail-page URLs out of the home page
    # 1. Instantiate a BeautifulSoup object and load the page source into it
    soup = BeautifulSoup(page_text, 'lxml')
    # 2. Parse the chapter titles and detail-page URLs
    li_list = soup.select('.book-mulu > ul > li')
    fp = open('./python爬虫/第三章:数据解析/sanguo.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.a.string
        detail = 'https://www.shicimingju.com' + li.a['href']
        # Request the detail page and parse the chapter body out of it
        detail_page_text = requests.get(url=detail, headers=headers).text
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter-content')
        # The chapter body
        content = div_tag.text
        fp.write(title + ":" + content + '\n')
        print(title, '爬取成功')
    fp.close()
xpath parsing
1. Instantiate an etree object and load the page source data to be parsed into it
2. Call the etree object's xpath method with an xpath expression to locate tags and capture their content
Environment setup:
pip install lxml
Instantiating the object:
from lxml import etree
# From a local HTML file
etree.parse(filePath)
# From page source fetched over the internet
etree.HTML(page_text)
Typical expressions look like //div[@class="song"] or //div[@class="song"]/p[3]; note that indexing starts at 1, not 0.
from lxml import etree

if __name__ == "__main__":
    # Instantiate an etree object with the source to be parsed loaded into it
    tree = etree.parse('test.html')
    # r = tree.xpath('/html/head/title')
    # r = tree.xpath('/html//title')
    # r = tree.xpath('//title')
    # r = tree.xpath('//div[@class="song"]')
    # r = tree.xpath('//div[@class="song"]/p[3]')
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]
    # r = tree.xpath('//li[7]//text()')[0]
    # r = tree.xpath('//div[@class="tang"]//text()')
    r = tree.xpath('//div[@class="song"]/img/@src')
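Note that xpath() always returns a list, which is why several of the expressions above are indexed with [0]. A tiny sketch against a made-up HTML string shows the return types:

from lxml import etree

# Hypothetical snippet used only to show what xpath() returns
tree = etree.HTML('<div class="song"><p>one</p><p>two</p><img src="/a.jpg"/></div>')
print(tree.xpath('//div[@class="song"]/p'))               # list of Element objects
print(tree.xpath('//div[@class="song"]/p/text()'))        # ['one', 'two']
print(tree.xpath('//div[@class="song"]/img/@src'))        # ['/a.jpg']
print(tree.xpath('//div[@class="song"]/p[1]/text()')[0])  # 'one' (indexing starts at 1)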
Requirement: crawl the listing titles from the second-hand housing section of 58同城 (58.com)
import requests
from lxml import etree

if __name__ == "__main__":
    # Fetch the page source
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    url = 'https://bj.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # Parse the data
    tree = etree.HTML(page_text)
    # li_list holds li element objects
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
    fp = open('./58.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.xpath('./div[2]/h2/a/text()')[0]
        fp.write(title + '\n')
    fp.close()
Requirement: parse and download image data (4K image site)
import requests
from lxml import etree
import os

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    url = 'http://pic.netbian.com/4kmeinv/'
    response = requests.get(url=url, headers=headers)
    # The response encoding can be set manually if needed
    # response.encoding = 'utf-8'
    page_text = response.text
    # Parse out the src and alt attribute values
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="clearfix"]/li')
    # Create a folder for the images
    if not os.path.exists('./picLibs'):
        os.mkdir('./picLibs')
    for li in li_list:
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # A general workaround for garbled Chinese text
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # Request the image data and persist it
        img_data = requests.get(url=img_src, headers=headers).content
        img_path = 'picLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, "下载成功!")
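There are two ways to deal with the garbled Chinese seen here: set the correct encoding on the whole Response before reading .text, or re-encode only the garbled string as the code above does for img_name. A minimal sketch (gbk is assumed to be the site's real charset, and the mojibake below is simulated with a round trip):

# Fix 1: correct the whole response before reading .text
# response.encoding = response.apparent_encoding   # let requests guess, or set 'gbk' directly
# page_text = response.text

# Fix 2: re-encode only the garbled string (the approach used for img_name above)
original = '美女图片'
garbled = original.encode('gbk').decode('iso-8859-1')  # simulate the mojibake
fixed = garbled.encode('iso-8859-1').decode('gbk')
print(garbled)            # unreadable mojibake
print(fixed == original)  # True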
Requirement: parse out the names of all cities on the air-quality history site
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    url = 'https://www.aqistudy.cn/historydata/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    all_city_names = []
    # Parse the names of the hot cities
    for li in hot_li_list:
        hot_city_name = li.xpath('./a/text()')[0]
        all_city_names.append(hot_city_name)
    # Parse the names of all the remaining cities
    city_names_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    for li in city_names_list:
        city_name = li.xpath('./a/text()')[0]
        all_city_names.append(city_name)
    print(all_city_names, len(all_city_names))
# A more concise version: locate both groups of a tags with a single expression using the | (union) operator
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    url = 'https://www.aqistudy.cn/historydata/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # Locate the a tags of the hot cities and of all cities in one pass
    # //div[@class="bottom"]/ul/li/a: hierarchy of the hot-city a tags
    # //div[@class="bottom"]/ul/div[2]/li/a: hierarchy of the all-city a tags
    a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
    all_city_names = []
    for a in a_list:
        city_name = a.xpath('./text()')[0]
        all_city_names.append(city_name)
    print(all_city_names, len(all_city_names))
Requirement: crawl the free resume templates from 站长素材 (chinaz.com)
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    detail_url_list = []
    for div in div_list:
        detail_url = div.xpath('./a/@href')[0]
        detail_url_list.append(detail_url)
        # print(detail_url)
    for detail_url in detail_url_list:
        # Request each template's detail page and parse out its download links
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_page_text)
        li_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')
        for li in li_list:
            file_url = li.xpath('./a/@href')[0]
            # print(file_url)
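The snippet above stops after collecting each file_url. Below is a minimal sketch of the missing persistence step, purely as an assumption about how it could continue (the jianliLibs folder name and the choice to name files after the archive URL are not from the original code):

import os
import requests

# Assumed continuation: download every template archive collected above
if not os.path.exists('./jianliLibs'):
    os.mkdir('./jianliLibs')
headers = {'User-Agent': 'Mozilla/5.0'}
file_urls = []  # in the code above, append every file_url found on the detail pages
for file_url in file_urls:
    data = requests.get(url=file_url, headers=headers).content  # binary archive data
    file_name = file_url.split('/')[-1]                         # e.g. xxx.rar
    with open('./jianliLibs/' + file_name, 'wb') as fp:
        fp.write(data)
    print(file_name, '下载成功!')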
This article was published with the limfx vscode plugin.