认识数据筛选
常见的数据筛选
import re
import requests
import os
# Crawl images: download every picture referenced on the Douban home page
# into ./imgLibs, using a regular expression to locate the image URLs.
if __name__ == "__main__":
    # Folder that receives every downloaded image. makedirs(exist_ok=True)
    # replaces the exists()/mkdir() pair, which is a check-then-create race.
    os.makedirs('./imgLibs', exist_ok=True)

    url = 'https://www.douban.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
                      '537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # requests does not time out by default; bound every call (seconds) so a
    # stalled server cannot hang the script forever.
    timeout = 10

    # General crawl: fetch the whole page behind `url`.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on a 4xx/5xx answer instead of scanning an error page.
    response.raise_for_status()
    page_text = response.text

    # Focused crawl: capture the data-origin URL of the <img> inside every
    # <div class="pic">. re.S lets '.' span the line breaks in the markup.
    ex = r'<div class="pic">.*?<img src=.*? data-origin="(.*?)" alt=.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)

    for src in img_src_list:
        # The file name is the last path segment of the image URL.
        img_name = src.split('/')[-1]
        try:
            img_response = requests.get(url=src, headers=headers, timeout=timeout)
            img_response.raise_for_status()
        except requests.RequestException as err:
            # One broken image must not abort the remaining downloads.
            print(img_name, '下载失败', err)
            continue

        # Image payloads are binary, so write the raw bytes unchanged.
        imgPath = './imgLibs/' + img_name
        with open(imgPath, 'wb') as fp:
            fp.write(img_response.content)
        print(img_name, '下载成功')
       
BeautifulSoup(bs4)基础
import requests
from bs4 import BeautifulSoup
if __name__ == "__main__":
    # Load the local HTML file into a BeautifulSoup object. The `with` block
    # closes the file handle once parsing is done (the original left it open).
    with open('test.html', 'r', encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, 'lxml')

    # Reference examples of the lookup API, kept disabled:
    # print(soup)
    # print(soup.a)  # soup.tagName returns the first tagName element in the HTML
    # print(soup.find('div'))  # equivalent to soup.div
    # print(soup.find('div', class_='song'))
    # print(soup.find_all('a'))
    # print(soup.select('.tang'))
    # print(soup.select('.tang > ul > li > a')[0])
    # print(soup.select('.tang > ul a')[0])
    # print(soup.select('.tang > ul a')[0].text)

    # Print the text content of the first <div class="song">.
    tag = soup.find('div', class_='song')
    print(tag.text)
       
XPath基础
from lxml import etree
if __name__ == "__main__":
    # Build an element tree from the local HTML file. etree.parse() uses the
    # strict XML parser unless told otherwise and raises XMLSyntaxError on
    # ordinary HTML (unclosed tags, HTML entities), so pass an HTML parser.
    # The encoding is given explicitly as utf-8.
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse('./test.html', parser)

    # Reference examples of XPath expressions, kept disabled:
    # r = tree.xpath('/html//title')
    # r = tree.xpath('//div[@class="song"]')
    # r = tree.xpath('//div[@class="song"]/p[3]')
    # r = tree.xpath('//div[@class="tang"]/ul/li[4]/a/text()')
    # r = tree.xpath('//div[@class="tang"]//text()')
    # r = tree.xpath('//div[@class="song"]/img/@src')

    # Text nodes of every <p> directly under <div class="song">.
    r = tree.xpath('//div[@class="song"]/p/text()')
    print(r)
       
|