线程池-协程-aiohttp-selenium

# 1.单线程+多任务异步协程

# 1.1 提高爬虫效率方式

开启线程池

from multiprocessing.dummy import Pool
import requests
import time

#异步代码
start = time.time()
pool = Pool(3)
urls = ['http://127.0.0.1:5000/bobo','http://127.0.0.1:5000/jay','http://127.0.0.1:5000/tom']
def get_request(url):
    return requests.get(url).text

response_list = pool.map(get_request,urls) #第二哥参数是一个列表,然后会遍历处理,返回是一个列表
print(response_list)

#解析,再次开启线程池
def parse(page_text):
    print(len(page_text))

pool.map(parse,response_list)
print('总耗时:',time.time()-start)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

单线程+多任务异步协程**(推荐方式)**
- **协程: **可以看成是一个对象.也可以把协程当做是一个特殊的函数.如果一个函数的定义被 async 关键字所修饰.该特殊的函数被调用后函数内部的程序语句不会被立即执行,而是会返回一个协程对象.
任务对象(task):所谓的任务对象就是对协程对象的进一步封装.在任务对象中可以实现显示协程对象的运行状况. 任务对象最终是需要被注册到事件循环对象中. - 绑定回调:回调函数是绑定给任务对象,只有当任务对象对应的特殊函数被执行完毕后,回调函数才会被执行.
事件循环对象:无限循环的对象.也可以把其当成是某一种容器.该容器中需要放置多个任务对象(就是一组待执行的代码块). - 异步的体现:当事件循环开启后,该对象会安装顺序执行每一个任务对象, 当一个任务对象发生了阻塞事件循环是不会等待,而是直接执行下一个任务对象
- await:挂起的操作.交出 cpu 的使用权

多任务的异步协程

注意事项:
- 1.将多个任务对象存储到一个列表中,然后将该列表注册到事件循环中.在注册的过程中,该列表需要被 wait方法进行处理.
- 2.在任务对象对应的特殊函数内部的实现中,不可以出现不支持异步模块的代码,否则就会中断整个的异步效果.并且,在该函数内部每一组阻塞的操作都必须使用await关键字进行修饰.
- 3.requests模块对应的代码不可以出现在特殊函数内部,因为requests是一个不支持异步的模块

import asyncio
from time import sleep
import time

start = time.time()
urls = [
    'http://localhost:5000/bobo',
    'http://localhost:5000/bobo',
]

# 回调函数:
# 默认参数:任务对象
def callback(task):
    print('i am callback!!1')
    print(task.result())  # result返回的就是任务对象对应的那个特殊函数的返回值

# 在待执行的代码块中不可以出现不支持异步模块的代码
# 在该函数内部如果有阻塞操作必须使用await关键字进行修饰
async def get_request(url):
    print('正在请求:', url)
    await asyncio.sleep(2)
    print('请求结束:', url)
    return 'hello bobo'


tasks = []  # 放置所有的任务对象
for url in urls:
    # 创建一个协程对象
    c = get_request(url)
    # 封装一个任务对象
    task = asyncio.ensure_future(c)
    # 给任务对象绑定回调函数
    tasks.append(task)
# 创建一个第三方循环事件
loop = asyncio.get_event_loop()

# 将任务对象注册到事件循环对象中并且开启了事件循环
loop.run_until_complete(asyncio.wait(tasks))

print(time.time() - start)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

# 2.aiohttp 使用

aiohttp:支持异步操作的网络请求的模块
- 环境安装:pip install aiohttp

初步的架构: async def req(url): with aiohttp.ClientSessio() as s: with s.get(url) as response: #response.read():byte page_text = response.text() return page_text

补充细节:在每一个with前面加上async,在每一步的阻塞操作前加上await async def req(url): async with aiohttp.ClientSessio() as s: async with await s.get(url) as response: #response.read():byte page_text = await response.text() return page_text

import asyncio
import requests
import time
import aiohttp
from lxml import etree

urls = [
    'http://localhost:5000/bobo',
    'http://localhost:5000/bobo',
]


# 无法实现异步的效果:是因为requests模块是一个不支持异步的模块
async def req(url):
    async with aiohttp.ClientSession() as s:
        async with await s.get(url) as response:
            # response.read():byte # 读取字节类型
            page_text = await response.text()
            return page_text  # 将返回值返回给回调函数
    # 细节:在每一个with前面加上async,在每一步的阻塞操作前加上await

def parse(task):
    page_text = task.result()
    tree = etree.HTML(page_text)
    name = tree.xpath('//p/text()')[0]
    print(name)

if __name__ == '__main__':
    start = time.time()
    tasks = []
    for url in urls:
        # 创建一个协程对象
        c = req(url)
        # 封装一个任务对象
        task = asyncio.ensure_future(c)
        # # 给任务对象绑定回调函数
        task.add_done_callback(parse)
        tasks.append(task)

    # 创建一个事件循环对象
    loop = asyncio.get_event_loop()
    # 将任务对象注册到事件循环对象中并且开启了事件循环
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

# 3.selenium 模块使用

概念:基于浏览器自动化的一个模块,可以模拟浏览器行为
环境的安装:下载selenium模块
selenium和爬虫之间的关联是什么?
- 便捷的获取页面中动态加载的数据
  - requests模块进行数据爬取:可见非可得
  - selenium:可见即可得
- 实现模拟登录
基本操作:
- 谷歌浏览器驱动程序下载地址:http://chromedriver.storage.googleapis.com/index.html
- 驱动程序和谷歌版本的映射关系表:https://blog.csdn.net/huilan_same/article/details/51896672
- 1.实例化某一款浏览器对象(驱动程序的路径)
- 2.find系列的函数用作于标签定位
动作链:一系列的行为动作
无头浏览器:无可视化界面的浏览器.
- phantomJS

# 3.1 百度搜索"美女",代码演示

from selenium import webdriver
from time import sleep
# 后面是你的浏览器驱动位置，记得前面加r'','r'是防止字符转义的
driver = webdriver.Chrome(r'D:\教学视频\python 爬虫\tools\chromedriver.exe') # 浏览器驱动路径
# 用get打开百度页面
driver.get("http://www.baidu.com")
# 查找页面的“设置”选项，并进行点击
sleep(1)
driver.find_elements_by_link_text('设置')[0].click()
sleep(2)
# 打开设置后找到“搜索设置”选项，设置为每页显示50条
driver.find_elements_by_link_text('搜索设置')[0].click()
sleep(2)
# 选中每页显示50条
m = driver.find_element_by_id('nr')
sleep(2)
m.find_element_by_xpath('//*[@id="nr"]/option[3]').click()
m.find_element_by_xpath('.//option[3]').click()
sleep(2)
# 点击保存设置
driver.find_elements_by_class_name("prefpanelgo")[0].click()
sleep(2)
# 处理弹出的警告页面   确定accept() 和 取消dismiss()
driver.switch_to_alert().accept()
sleep(2)
# 找到百度的输入框，并输入 美女
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)
# 点击搜索按钮
driver.find_element_by_id('su').click()
sleep(2)
# 在打开的页面中找到“Selenium - 开源中国社区”，并打开这个页面
driver.find_elements_by_link_text('美女_百度图片')[0].click()
sleep(3)
# 关闭浏览器
driver.quit()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

# 3.2 selenium的基本操作

from selenium import webdriver
from time import sleep
#实例化一个浏览器对象
bro = webdriver.Chrome(executable_path=r'C:\Users\oldboy-python\Desktop\爬虫+数据\day04\chromedriver.exe')
url = 'https://www.jd.com/'
bro.get(url) #用户发起请求
#定位标签
search_input = bro.find_element_by_id('key')
#对指定标签进行数据交互
search_input.send_keys('macPro')
btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()
sleep(2)
#执行js代码
jsCode = 'window.scrollTo(0,document.body.scrollHeight)'
bro.execute_script(jsCode)
sleep(3)
bro.quit()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

# 3.3 seleniu爬取药监总局数据

# 爬取前两页,爬取动态获取到额数据
from selenium import webdriver
from lxml import etree
from time import sleep
#实例化一个浏览器对象
page_text_list = []
bro = webdriver.Chrome(executable_path=r'./chromedriver.exe')
url = 'http://125.35.6.84:81/xk/'
bro.get(url)
sleep(2)
#page_source返回的就是当前浏览器打卡页面对应的页面源码数据
page_text = bro.page_source
page_text_list.append(page_text)

for i in range(2):
    bro.find_element_by_id('pageIto_next').click()
    sleep(2)
    page_text = bro.page_source
    page_text_list.append(page_text)

for page_text in page_text_list:
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//*[@id="gzlist"]/li')
    for li in li_list:
        name = li.xpath('./dl/@title')[0]
        print(name)
sleep(3)
bro.quit()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

# 3.4 selenium 动作链

from lxml import etree
from time import sleep
from selenium import webdriver
from selenium.webdriver import ActionChains

# 实例化一个浏览器对象
page_text_list = []
bro = webdriver.Chrome(executable_path=r'./chromedriver.exe')
url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
bro.get(url)
# 如果定位的标签是存在于iframe对应的子页面中的话,在进行标签定位前一定要执行一个switch_to的操作
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_id('draggable')

# 1.实例化动作链对象
action = ActionChains(bro)
action.click_and_hold(div_tag)

for i in range(5):
    # .perform()方法是让动作链立即执行
    action.move_by_offset(17, 0).perform()
    sleep(0.5)
action.release()  # 释放
sleep(3)
bro.quit()  # 关闭浏览器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

# 3.5 无头浏览器

无头浏览器是为了用户使用过程中弹出浏览器自动操作:使用chorm浏览器的无头模式

from selenium.webdriver.chrome.options import Options
from time import sleep
from selenium import webdriver

# 创建一个参数对象，用来控制chrome以无界面模式打开
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# 实例化一个浏览器对象
bro = webdriver.Chrome(executable_path=r'./chromedriver.exe', chrome_options=chrome_options)
bro.get('https://www.baidu.com')
sleep(2)
bro.save_screenshot('1.png')
print(bro.page_source)
sleep(2)
bro.quit()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

# 3.6 selenium 规避风险

某些网站存在selenium检测

from time import sleep
from selenium import webdriver
from selenium.webdriver import ChromeOptions
# 实例化一个对象规避检测
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

#实例化一个浏览器对象
bro = webdriver.Chrome(executable_path=r'./chromedriver.exe',options=option)
bro.get('https://www.taobao.com/')

1
2
3
4
5
6
7
8
9
10
11

# 3.7 12306 模拟登录

使用截图,坐标定位,点击动作链技术

from selenium import webdriver
from selenium.webdriver import ActionChains
from PIL import Image  # 用作于图片的裁剪
from ChaoJiYing import Chaojiying_Client
from time import sleep
bro = webdriver.Chrome(executable_path=r'./chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
sleep(5)
# 验证码图片进行捕获(裁剪)
bro.save_screenshot('main.png')
# 定位到了验证码图片对应的标签
code_img_ele = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img_ele.location  # 验证码图片基于当前整张页面的左下角坐标
size = code_img_ele.size  # 验证码图片的长和宽
# 裁剪的矩形区域(左下角和右上角两点的坐标)
rangle = (
int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
i = Image.open('main.png')
frame = i.crop(rangle)
frame.save('code.png')
# 使用打码平台进行验证码的识别
chaojiying = Chaojiying_Client('bobo328410948', 'bobo328410948', '899370')  # 用户中心>>软件ID 生成一个替换 96001
im = open('code.png', 'rb').read()  # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
result = chaojiying.PostPic(im, 9004)['pic_str']
print(result)  # x1,y1|x2,y2|x3,y3  ==> [[x1,y1],[x2,y2],[x3,y3]]
all_list = []  # [[x1,y1],[x2,y2],[x3,y3]] 每一个列表元素表示一个点的坐标,坐标对应值的0,0点是验证码图片左下角
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
# action = ActionChains(bro)
for l in all_list:
    x = l[0]
    y = l[1]
    ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
    sleep(1)
sleep(3)
bro.quit()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

#Python #

上次更新: 2023/04/16, 18:35:33

← 代理-cookie-验证码识别-模拟登录移动端数据爬取-scrapy框架→