95-Python爬虫-正则表达式

1.使用search()方法匹配字符串

bash 复制代码

# -*- coding: utf-8 -*-
"""
@Project : 01-python-learn
@File    : demo09.py
@IDE     : PyCharm
@Author  : 刘庆东
@Date    : 2025/11/19 9:07

Matching strings with the search() function of the re module.
"""
import re

# Raw string: keeps the backslash of \w intact for the regex engine instead
# of relying on Python tolerating an unknown string escape.
pattern = r'mr_\w+'

# First sample text: the prefix appears in both upper and lower case.
string = 'MR_SHOP mr_shop'

# re.I makes the search case-insensitive; search() reports only the first
# occurrence found while scanning the string.
match = re.search(pattern, string, re.I)
print(match)

# Second sample text: the match does not have to start at position 0.
string = '项目名称是 mr_shop'

# Same case-insensitive search on the new text.
match = re.search(pattern, string, re.I)
print(match)


# -*- coding: utf-8 -*-
"""
@Project : 01-python-learn
@File    : demo10.py
@IDE     : PyCharm
@Author  : 刘庆东
@Date    : 2025/11/19 9:13

Optional matching.
"""
import re

# How the pattern below is put together (kept as comments so the backslashes
# are never interpreted as string escapes):
#   (\d?)+                 leading digits, which may be absent
#   \s?                    an optional whitespace character
#   ([\u4e00-\u9fa5]?)+    trailing Chinese characters, which may be absent
#
# Raw string: \d, \s and the \uXXXX range are handed to the regex engine
# verbatim rather than going through Python's string-escape processing.
pattern = r'(\d?)+mrsoft\s?([\u4e00-\u9fa5]?)+'

# Digits in front of the keyword.
match = re.search(pattern, '01mrsoft')
print(match)

# The bare keyword, with every optional part missing.
match = re.search(pattern, 'mrsoft')
print(match)

# Keyword followed by a space and Chinese characters.
match = re.search(pattern, 'mrsoft 第一')
print(match)

import re

# Word-boundary matching: \b anchors on the edge between a word character
# and a non-word character (or the start/end of the string), so the pattern
# only matches "mr" when it stands alone as a whole word.
# The leading backslash is required; without it the pattern searches for the
# literal text "bmr" and none of the samples below can match.
pattern = r'\bmr\b'

# "mr" is immediately followed by a letter, so there is no boundary after it.
match = re.search(pattern, 'mrsoft')
print(match)


# "mr" is a standalone word here, so this one matches.
match = re.search(pattern, 'mr soft')
print(match)


# A leading space gives a boundary before "mr" but still none after it.
match = re.search(pattern, ' mrsoft')
print(match)

2.使用findall()方法匹配字符串

bash 复制代码

# import re
# pattern = 'mr_\w+'
# string='MR_SHOP mr_shop'
# # 匹配mr_开头的字符串
# match=re.findall(pattern,string,re.I)
# print(match)

# Greedy matching with findall()
import re

# '.*' is greedy: it consumes as much text as it can while still leaving a
# final '/' for the pattern to end on.
pattern = 'https://.*/'

# With no capture group in the pattern, findall() returns the full text of
# each match. (The same call can be tried with 'https://www.hao123.com/'.)
match = re.findall(pattern, string='https://www.baidu.com/')
print(match)

# Same URL shape, but now the part after the scheme is wrapped in a group.
pattern = 'https://(.*/)'

# With one capture group, findall() returns only the text captured by the
# group, not the whole match.
match = re.findall(pattern, string='https://www.hao123.com/')
print(match)

"""
['https://www.baidu.com/']
['www.hao123.com/']
"""

3.字符串的处理

bash 复制代码

# -*- coding: utf-8 -*-
"""
@Project : 01-python-learn
@File    : demo13.py
@IDE     : PyCharm
@Author  : 刘庆东
@Date    : 2025/11/19 9:50
"""
# import re
# #定义要替换的模式字符串
# pattern = r'1[34578]\d{9}'
# #隐藏中奖的手机号码
# string='中奖号码为:68932756 联系电话为:13612345678'
# result=re.sub(pattern,'1XXXXXXXXXX',string)
# print(result)

# import re
#
# string='John,I like you to meet Mr. Wang，Mr. Wang, this is our Sales Manager John. John, this is Mr. Wang.'
#
# pattern='Wang'
#
# match=re.subn(pattern,'Liushao',string)
#
# print(match)

"""
输出结果
('John,I like you to meet Mr. Liushao，Mr. Liushao, this is our Sales Manager John. John, this is Mr. Liushao.', 3)
"""

# import re
# string='hk400 jhkj6h7k5 jhkjhk1j0k66'
#
# pattern='[a-z]'
#
# match=re.subn(pattern,'',string,flags=re.I)
#
# print(match)

"""
输出结果
('400 675 1066', 16)
"""


# Splitting a string with re.split()
import re

# Pipe-delimited sample record.
string = '预定|K7577|CCT|THL|CCT|LYL|15:47|16:51|02:05|Y|'

# '|' is the alternation operator in a regex, so it must be escaped to be
# matched literally; the raw string keeps that backslash out of Python's own
# string-escape processing.
pattern = r'\|'

# maxsplit=1 splits only at the first delimiter, so the result has two items:
# the text before the first '|' and the untouched remainder.
match = re.split(pattern, string, maxsplit=1)
print(match)

4.使用爬虫爬取QQ音乐信息

bash 复制代码

# -*- coding: utf-8 -*-
"""
@Project : 01-python-learn
@File    : demo14.py
@IDE     : PyCharm
@Author  : 刘庆东
@Date    : 2025/11/19 10:03
"""

import requests #导入requests模块
import re

def send_request(url, headers, timeout=10):
    """Fetch a page with an HTTP GET request and return its body text.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on an
            unresponsive server.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.
    """
    # Send the network request.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Only a 200 response is treated as success.
    if response.status_code == 200:
        # Hand back the HTML source of the page.
        return response.text
    # Any other status: signal "no content" explicitly to the caller.
    return None

def interperting_data(html_text):
    """Extract song names, singers and durations from HTML and print them.

    Each of the three fields is collected with its own regular expression;
    the results are then paired up positionally and printed one value per
    line (name, singer, duration).

    Args:
        html_text: HTML source of the chart page. ``None`` or an empty
            string (e.g. when the request did not succeed) prints nothing.
    """
    # Nothing to parse: re.findall() would raise TypeError on None.
    if not html_text:
        return

    # Song names: text of anchors whose first attribute is title.
    names = re.findall(r'<a title=".*?" href=".*?">(.*?)</a>', html_text)
    # Singers: text of the author anchor inside the artist cell.
    singers = re.findall(
        r'<div class="songlist__artist"><a class="playlist__author" title=".*?" href=".*?">(.*?)</a>',
        html_text)
    # Durations: text of the time cell.
    time_len = re.findall(r'<div class="songlist__time">(.*?)</div>', html_text)

    # zip() stops at the shortest list, so only complete triples are printed.
    for n, s, t in zip(names, singers, time_len):
        print(n)
        print(s)
        print(t)

if __name__ == '__main__':
    # Script entry point: download the chart page, then parse and print it.

    # Request headers; a browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4503.5 Safari/537.36'}
    # Address of the chart page to fetch.
    url = 'https://y.qq.com/n/ryqq/toplist/26'

    # Step 1: send the network request.
    html_text = send_request(url, headers)
    # Step 2: parse the returned HTML and print the extracted fields.
    interperting_data(html_text)


"""
SAUCE
刘耀文
02:49
恋人
李荣浩
04:35
街角的晚风 (粤语版)
善宇
02:35
Ketchup And Lemonade (NINGNING Solo)
aespa (에스파)
04:08
唯一
G.E.M. 邓紫棋
04:13
到时说爱我
茜拉 (Shila Amzah)
03:21
离开我的依赖
王艳薇
03:53
爱错
王力宏
03:58
我想要占据你
告五人
04:12
GOOD STUFF (KARINA Solo)
aespa (에스파)
02:55
我怀念的<span class="songlist__song_txt">《第二回合我爱你（幸运日)》电视剧主题曲</span>
孙燕姿
04:49
其实<span class="songlist__song_txt">《妈妈像花儿一样》电视剧插曲</span>
薛之谦
04:02
现在那边是几点
黄小琥
04:28
特别的人
方大同
04:19
两三句 (趁着灯火不算清晰)
宋佳野
03:39
Always Online<span class="songlist__song_txt">联想idea Pad S9/S10笔记本主题曲</span>
林俊杰
03:45
江南
林俊杰
04:27
多远都要在一起
G.E.M. 邓紫棋
03:37
红色高跟鞋<span class="songlist__song_txt">《爱情左右》电影主题曲</span>
蔡健雅
03:26
未出现传闻 (那曾经为你熬的夜失的眠冒的险)
Bao小易
04:04

"""