python中re模块详细教程

原创已于 2025-09-05 22:21:36 修改 · 839 阅读

10 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#python #开发语言 #爬虫

于 2025-08-10 20:59:46 首次发布

Python 正则表达式 re 模块语法参考

re模块概述

Python的re模块提供了正则表达式的支持，是处理字符串模式匹配的强大工具。

导入模块

import re

基本函数

1. re.match()

从字符串开头匹配模式

import re

# Basic usage: re.match only succeeds when the pattern matches at position 0.
result = re.match(r'hello', 'hello world')
print(result.group())  # prints: hello

# Capturing groups: pull both words out of a single match.
result = re.match(r'(\w+) (\w+)', 'hello world')
first_word, second_word = result.groups()
print((first_word, second_word))  # prints: ('hello', 'world')
print(first_word)  # prints: hello
print(second_word)  # prints: world

2. re.search()

在字符串中搜索模式

import re

# Basic search: re.search scans the string and returns the first match.
result = re.search(r'\d+', 'abc123def')
print(result.group())  # prints: 123

# Search for an e-mail address.
# Inside [...] a '|' is a literal pipe, not alternation, so the top-level
# domain class is written [A-Za-z] to avoid accepting "x@y.c|m".
text = "联系我：user@example.com 或 admin@test.org"
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
result = re.search(email_pattern, text)
print(result.group())  # prints: user@example.com

3. re.findall()

查找所有匹配项

import re

# Find every run of digits.
text = "价格是123元，数量是456个"
numbers = re.findall(r'\d+', text)
print(numbers)  # prints: ['123', '456']

# Find every e-mail address.
# Inside [...] a '|' is a literal pipe, not alternation, so the top-level
# domain class is written [A-Za-z].
text = "联系我：user@example.com 或 admin@test.org"
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)
print(emails)  # prints: ['user@example.com', 'admin@test.org']

4. re.finditer()

返回迭代器，包含所有匹配对象

import re

text = "价格是123元，数量是456个"
# Each item yielded by finditer is a match object, so both the matched
# text (group) and its start/end offsets (span) are available.
for match in re.finditer(r'\d+', text):
    print(f"找到数字: {match.group()} 位置: {match.span()}")
# Output:
# 找到数字: 123 位置: (3, 6)
# 找到数字: 456 位置: (11, 14)

5. re.sub()

替换匹配的字符串

import re

# Plain replacement: swap the matched text for a fixed string.
text = "hello world"
new_text = re.sub(r'world', 'python', text)
print(new_text)  # prints: hello python


# Replacement computed by a callback that receives each match object.
def replace_func(match):
    """Return the matched integer doubled, rendered as a string."""
    matched_number = int(match.group())
    doubled = matched_number * 2
    return str(doubled)


text = "价格是123元，数量是456个"
new_text = re.sub(r'\d+', replace_func, text)
print(new_text)  # prints: 价格是246元，数量是912个

6. re.split()

根据模式分割字符串

import re

text = "abc123def456ghi"

# Split on runs of digits; the separators are dropped from the result.
letters_only = re.split(r'\d+', text)
print(letters_only)  # prints: ['abc', 'def', 'ghi']

# A capturing group around the pattern keeps the separators in the result.
parts = re.split(r'(\d+)', text)
print(parts)  # prints: ['abc', '123', 'def', '456', 'ghi']

7. re.compile()

编译正则表达式对象

import re

# Compile the pattern once and reuse the resulting object below.
pattern = re.compile(r'\d+')
text = "abc123def456ghi"

# Searching through the compiled object.
result = pattern.findall(text)
print(result)  # prints: ['123', '456']

# Substituting through the same compiled object.
new_text = pattern.sub('NUMBER', text)
print(new_text)  # prints: abcNUMBERdefNUMBERghi

正则表达式语法

字符类

import re

# Character set
pattern = r'[abc]'  # matches a, b, or c
text = "apple banana cherry"
matches = re.findall(pattern, text)
print(matches)  # Output: ['a', 'b', 'a', 'a', 'a', 'c']

# Ranges (each assignment overwrites `pattern`; they are listed for reference)
pattern = r'[a-z]'  # matches a lowercase letter
pattern = r'[A-Z]'  # matches an uppercase letter
pattern = r'[0-9]'  # matches a digit
pattern = r'[a-zA-Z0-9]'  # matches a letter or a digit

# Negated character set
pattern = r'[^abc]'  # matches any character other than a, b, c

预定义字符类

import re

# Common predefined character classes.
# The descriptions are raw strings so that sequences such as \t and \n are
# printed literally instead of becoming real control characters, and the
# whitespace descriptions include the space character itself.
# The bracketed forms are the ASCII equivalents; for str patterns these
# classes also cover Unicode characters unless re.ASCII is used.
patterns = {
    r'\d': r'数字 [0-9]',
    r'\D': r'非数字 [^0-9]',
    r'\w': r'单词字符 [a-zA-Z0-9_]',
    r'\W': r'非单词字符 [^a-zA-Z0-9_]',
    r'\s': r'空白字符 [ \t\n\r\f\v]',
    r'\S': r'非空白字符 [^ \t\n\r\f\v]',
    r'.': r'任意字符（除换行符外）',
}

text = "Hello 123 World!"
# Show what each class matches in the same sample text.
for pattern, description in patterns.items():
    matches = re.findall(pattern, text)
    print(f"{description}: {matches}")

量词

import re

# The basic quantifiers, keyed by pattern, with a description of each.
patterns = {
    r'a*': '匹配 a 0次或多次',
    r'a+': '匹配 a 1次或多次',
    r'a?': '匹配 a 0次或1次',
    r'a{3}': '匹配 a 恰好3次',
    r'a{3,}': '匹配 a 至少3次',
    r'a{3,5}': '匹配 a 3到5次'
}

text = "aa aaa aaaa aaaaa"
# Run every quantifier against the same sample text.
for pattern in patterns:
    description = patterns[pattern]
    matches = re.findall(pattern, text)
    print(f"{description}: {matches}")

贪婪与非贪婪

import re

text = "<div>content1</div><div>content2</div>"

# Greedy (the default): .* extends to the LAST closing tag.
greedy_pattern = r'<div>.*</div>'
greedy_match = re.compile(greedy_pattern).search(text)
print("贪婪匹配:", greedy_match.group())

# Non-greedy: .*? stops at the FIRST closing tag, giving one hit per div.
non_greedy_pattern = r'<div>.*?</div>'
non_greedy_matches = re.compile(non_greedy_pattern).findall(text)
print("非贪婪匹配:", non_greedy_matches)

位置锚点

import re

# Line anchors: with re.MULTILINE, ^ and $ apply to every line.
text = "hello\nworld\nhello python"
pattern = r'^hello'  # hello at the start of a line
matches = re.findall(pattern, text, flags=re.MULTILINE)
print("行首匹配:", matches)

pattern = r'hello$'  # hello at the end of a line
matches = re.findall(pattern, text, flags=re.MULTILINE)
print("行尾匹配:", matches)

# Word boundaries: \b on both sides matches only the standalone word.
text = "word wordly sword"
pattern = r'\bword\b'
matches = re.compile(pattern).findall(text)
print("单词边界匹配:", matches)

分组和引用

import re

text = "John Doe, Jane Smith"

# Capturing groups: findall returns one tuple of groups per match.
pattern = r'(\w+) (\w+)'
matches = re.findall(pattern, text)
print("捕获组:", matches)  # prints: [('John', 'Doe'), ('Jane', 'Smith')]

# Named groups: groupdict() maps each group name to the text it captured.
pattern = r'(?P<first>\w+) (?P<last>\w+)'
match = re.search(pattern, text)
if match is not None:
    print("命名组:", match.groupdict())

# Backreference: \1 must repeat exactly what group 1 captured.
text = "hello hello world world"
pattern = r'(\w+) \1'
matches = re.findall(pattern, text)
print("反向引用:", matches)

断言

import re

# Positive lookahead: "apple" only when followed by " pie" or " juice".
text = "apple pie, apple juice, apple"
pattern = r'apple(?=\s+(?:pie|juice))'
matches = re.findall(pattern, text)
print("正向先行断言:", matches)  # prints: ['apple', 'apple']

# Negative lookahead: "apple" only when NOT followed by " pie" or " juice".
pattern = r'apple(?!\s+(?:pie|juice))'
matches = re.findall(pattern, text)
print("负向先行断言:", matches)  # prints: ['apple']

# Positive lookbehind: "apple" only when preceded by a digit.
# Python's re requires lookbehind patterns to have a fixed width, so a
# variable-width form such as (?<=\d+) raises re.error. Checking for a
# single preceding digit expresses the same condition.
text = "123apple, 456apple, apple"
pattern = r'(?<=\d)apple'
matches = re.findall(pattern, text)
print("正向后行断言:", matches)  # prints: ['apple', 'apple']

常用模式

邮箱验证

import re

def validate_email(email):
    """Return True if *email* has the shape ``local@domain.tld``.

    The local part may contain letters, digits and ``._%+-``; the domain may
    contain letters, digits, dots and hyphens; the final label must be at
    least two letters. This is a format check only.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch requires the whole string to match. re.match with a trailing
    # '$' would also accept input that ends in a newline.
    return re.fullmatch(pattern, email) is not None

# Demo
emails = [
    "user@example.com",
    "invalid-email",
    "user@domain",
    "user.name@domain.co.uk"
]

for email in emails:
    print(f"{email}: {validate_email(email)}")

手机号验证（中国大陆）

import re

def validate_phone(phone):
    """Return True if *phone* is an 11-digit mainland-China mobile number.

    The number must start with ``1``, have ``3``-``9`` as its second digit,
    and be followed by nine more ASCII digits.
    """
    # fullmatch rejects a trailing newline that re.match + '$' would accept;
    # re.ASCII keeps \d from matching non-ASCII digits such as full-width '８'.
    return re.fullmatch(r'1[3-9]\d{9}', phone, re.ASCII) is not None

# Demo
phones = [
    "13812345678",
    "12345678901",
    "1381234567",
    "23812345678"
]

for phone in phones:
    print(f"{phone}: {validate_phone(phone)}")

身份证号验证（中国大陆）

import re

def validate_id_card(id_card):
    """Return True if *id_card* has the layout of an 18-character ID number.

    Layout checked: a 6-digit region code not starting with 0, a birth year
    beginning with 18/19/20, a month 01-12, a day 01-31, a 3-digit sequence
    and a final digit or ``X``/``x``. Only the format is checked: the
    checksum character is not verified and impossible dates such as
    February 31 are not rejected.
    """
    pattern = (
        r'[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))'
        r'(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]'
    )
    # fullmatch rejects a trailing newline that re.match + '$' would accept;
    # re.ASCII restricts \d to 0-9.
    return re.fullmatch(pattern, id_card, re.ASCII) is not None

# Demo
id_cards = [
    "110101199001011234",
    "123456789012345",
    "11010119900101123X"
]

for id_card in id_cards:
    print(f"{id_card}: {validate_id_card(id_card)}")

URL验证

import re

def validate_url(url):
    """Return True if *url* is an http:// or https:// URL without whitespace.

    After the scheme, the first character must not be whitespace or one of
    ``/ $ . ? #``, and at least one more character must follow.
    """
    pattern = r'https?://[^\s/$.?#].[^\s]*'
    # fullmatch requires the whole string to match. re.match with a trailing
    # '$' would also accept a URL that ends in a newline.
    return re.fullmatch(pattern, url) is not None

# Demo
urls = [
    "https://www.example.com",
    "http://example.com/path",
    "ftp://example.com",
    "not-a-url"
]

for url in urls:
    print(f"{url}: {validate_url(url)}")

密码强度验证

import re

def validate_password_strength(password):
    """Return True if *password* meets the strength rules.

    Rules: at least 8 characters, drawn only from letters, digits and
    ``@$!%*?&``, with at least one lowercase letter, one uppercase letter,
    one digit and one of those special characters.
    """
    # Each lookahead checks one requirement without consuming input; the
    # final character class then enforces the allowed alphabet and length.
    pattern = r'(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}'
    # fullmatch requires the whole string to match. re.match with a trailing
    # '$' would also accept a password that ends in a newline.
    return re.fullmatch(pattern, password) is not None

# Demo
passwords = [
    "MyPass123!",
    "weak",
    "NoSpecialChar123",
    "nouppercase123!"
]

for password in passwords:
    print(f"{password}: {validate_password_strength(password)}")

实际应用示例

1. 提取HTML标签内容

import re

def extract_html_content(html):
    """Return the text between each tag and the closing tag that follows it.

    The opening and closing tag names are not compared, and text containing
    a nested ``<`` is not matched.
    """
    tag_pair = re.compile(r'<[^>]*>([^<]*)</[^>]*>')
    return [hit.group(1) for hit in tag_pair.finditer(html)]

# Demo
html = "<div>Hello World</div><p>Test</p><span>Content</span>"
contents = extract_html_content(html)
print("HTML内容:", contents)

2. 格式化电话号码

import re

def format_phone_number(phone):
    """Format an 11-digit phone number as ``XXX-XXXX-XXXX``.

    Non-digit characters are stripped first. If the result does not have
    exactly 11 digits, the original *phone* string is returned unchanged.
    """
    digits_only = re.sub(r'\D', '', phone)

    # Guard clause: anything that is not 11 digits is passed through as-is.
    if len(digits_only) != 11:
        return phone

    return re.sub(r'(\d{3})(\d{4})(\d{4})', r'\1-\2-\3', digits_only)

# Demo
phones = [
    "13812345678",
    "138-1234-5678",
    "138 1234 5678",
    "138.1234.5678"
]

for phone in phones:
    formatted = format_phone_number(phone)
    print(f"{phone} -> {formatted}")

3. 解析日志文件

import re
from datetime import datetime

def parse_log_line(log_line):
    """Parse one log line of the form ``[YYYY-MM-DD HH:MM:SS] LEVEL: message``.

    Returns a dict with keys ``'timestamp'`` (a ``datetime``), ``'level'``
    and ``'message'``, or ``None`` when the line does not have that shape or
    its timestamp is not a real date/time.
    """
    pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+): (.+)'
    match = re.match(pattern, log_line)
    if match is None:
        return None

    timestamp, level, message = match.groups()
    try:
        # The regex only checks digit counts; strptime rejects impossible
        # values such as month 13 or hour 25.
        parsed_time = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        return None

    return {
        'timestamp': parsed_time,
        'level': level,
        'message': message
    }

# Demo
log_lines = [
    "[2023-01-01 12:00:00] INFO: User login successful",
    "[2023-01-01 12:01:15] ERROR: Database connection failed",
    "Invalid log line"
]

for line in log_lines:
    parsed = parse_log_line(line)
    print(f"原始: {line}")
    print(f"解析: {parsed}")
    print()

4. 数据清洗

import re

def clean_text(text):
    """Return *text* with symbols removed and whitespace normalised.

    Keeps Chinese characters (U+4E00-U+9FA5), ASCII letters, digits and
    whitespace; collapses every run of whitespace to one space; strips
    leading and trailing whitespace.
    """
    # Remove symbols first: deleting them can leave two spaces side by side,
    # so whitespace has to be collapsed afterwards.
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)

    # Collapse runs of whitespace (including newlines) to a single space.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Demo
dirty_text = "  Hello   World!  @#$%^&*()  \n\n  测试文本  "
clean = clean_text(dirty_text)
print(f"原始文本: '{dirty_text}'")
print(f"清洗后: '{clean}'")

5. 提取CSV数据

import re

def parse_csv_line(line):
    """Split one CSV line into a list of stripped field strings.

    A field is either wrapped in double quotes (and may then contain commas)
    or is a plain run of non-comma characters. Empty fields are preserved, so
    ``'a,,b'`` yields ``['a', '', 'b']``. An empty line yields ``[]``.
    Escaped quotes (``""``) inside a quoted field are not supported.
    """
    if not line:
        return []

    # Each field is tied to the start of the line or to the comma before it,
    # so an empty field still produces a match instead of being skipped.
    pattern = r'(?:^|,)(?:"([^"]*)"|([^,]*))'
    return [
        (quoted or plain).strip()
        for quoted, plain in re.findall(pattern, line)
    ]

# Demo
csv_lines = [
    'name,age,city',
    'John,25,"New York, NY"',
    'Jane,30,"Los Angeles, CA"',
    'Bob,35,Chicago'
]

for line in csv_lines:
    fields = parse_csv_line(line)
    print(f"原始: {line}")
    print(f"解析: {fields}")
    print()

性能优化

1. 使用编译后的正则表达式

import re
import time

# Uncompiled version
def test_uncompiled(text, iterations=10000):
    """Return the seconds taken by *iterations* calls to ``re.findall``.

    The pattern is passed as a string on every call.
    """
    # perf_counter is the clock intended for measuring short intervals.
    start = time.perf_counter()
    for _ in range(iterations):
        re.findall(r'\d+', text)
    return time.perf_counter() - start

# Compiled version
def test_compiled(text, iterations=10000):
    """Return the seconds taken by *iterations* calls on a compiled pattern.

    The pattern is compiled once, before the timer starts.
    """
    pattern = re.compile(r'\d+')
    start = time.perf_counter()
    for _ in range(iterations):
        pattern.findall(text)
    return time.perf_counter() - start

# Run the comparison
text = "abc123def456ghi789" * 100
uncompiled_time = test_uncompiled(text)
compiled_time = test_compiled(text)

# Guard the ratio so a zero reading cannot raise ZeroDivisionError.
speedup = uncompiled_time / compiled_time if compiled_time else float('inf')

print(f"未编译时间: {uncompiled_time:.4f}秒")
print(f"编译时间: {compiled_time:.4f}秒")
print(f"性能提升: {speedup:.2f}倍")

2. 避免回溯灾难

import re

# Risky: the nested quantifiers let the engine split a run of 'a's between
# the inner and outer '+' in exponentially many ways.
bad_pattern = r'(a+)+b'

# Safe: a single quantifier accepts exactly the same strings.
good_pattern = r'a+b'

# On input that matches, both patterns succeed on the first attempt.
text = 'a' * 1000 + 'b'
re.match(bad_pattern, text)
re.match(good_pattern, text)

# The blow-up only appears when the match FAILS, because the engine then
# tries every possible split before giving up. The re module raises no
# exception and has no timeout, so try/except cannot detect it; the failing
# sample is kept short here so the demo still finishes quickly.
failing_text = 'a' * 16
print("bad_pattern:", re.match(bad_pattern, failing_text))
print("good_pattern:", re.match(good_pattern, failing_text))

常见问题

1. 转义字符问题

import re

# Wrong: a plain (non-raw) string with a single backslash
# pattern = '\d+'  # '\d' is not a recognised string escape; newer Python versions warn about it

# Right: use a raw string
pattern = r'\d+'

# Or double the backslash in a plain string
pattern = '\\d+'

2. 贪婪匹配问题

import re

text = "<div>content1</div><div>content2</div>"

# Greedy: a single hit spanning both divs (often not what was intended).
greedy = re.compile(r'<div>.*</div>').findall(text)
print("贪婪匹配:", greedy)

# Non-greedy: one hit per div (usually the intended result).
non_greedy = re.compile(r'<div>.*?</div>').findall(text)
print("非贪婪匹配:", non_greedy)

3. 多行模式

import re

text = """line1
line2
line3"""

# Default mode: ^ matches only at the very start of the string.
matches = re.findall(r'^line', text)
print("不使用多行模式:", matches)

# re.MULTILINE: ^ also matches right after every newline.
matches = re.findall(r'^line', text, flags=re.MULTILINE)
print("使用多行模式:", matches)

4. 大小写敏感

import re

text = "Hello WORLD"

# Default: matching is case-sensitive, so lowercase 'hello' finds nothing.
matches = re.findall(r'hello', text)
print("区分大小写:", matches)

# re.IGNORECASE: letter case is ignored and 'Hello' is found.
matches = re.findall(r'hello', text, flags=re.IGNORECASE)
print("忽略大小写:", matches)