Python（及PHP）提取网页中所有的img链接

Tags: /Python/ Date Created:

<?php

// Command-line script: download the page at the given URL and print the
// absolute http(s) URL found in the src attribute of every <img> tag,
// one URL per line.
//
// Usage: php script.php <url>
// Exit status: 0 on success, 1 on bad arguments or when the page cannot be
// fetched or scanned.

if ($argc != 2) {
    echo "Usage: php script.php <url>\n";
    exit(1);
}

$url = $argv[1];

// Send a Chrome-like User-Agent with the request.
$options = [
    'http' => [
        'header' => 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    ],
];

$context = stream_context_create($options);

// file_get_contents() does not throw on failure: it raises a warning and
// returns false. A try/catch around it never fires, so the result has to be
// checked explicitly; otherwise a failed download would silently print
// nothing and exit with status 0.
$html = file_get_contents($url, false, $context);
if ($html === false) {
    $lastError = error_get_last();
    $reason = $lastError !== null ? $lastError['message'] : 'unknown error';
    echo "Error fetching the URL: {$reason}\n";
    exit(1);
}

// Capture the quoted src value of each <img> tag (case-insensitive).
$pattern = '/<img[^>]*\bsrc=["\']([^"\']+)/i';

// preg_match_all() returns false when the regex engine itself fails
// (e.g. a backtrack limit is hit); treat that as an error, not "no images".
if (preg_match_all($pattern, $html, $matches) === false) {
    echo "Error scanning the page for img tags\n";
    exit(1);
}

// Keep only absolute http(s) links. Requiring that prefix also drops
// base64 "data:image" URIs and relative paths, so no separate check is needed.
$imgLinks = array_filter($matches[1], function ($link) {
    return strpos($link, 'http://') === 0 || strpos($link, 'https://') === 0;
});

// Print every remaining link on its own line.
foreach ($imgLinks as $link) {
    echo $link . "\n";
}
// The closing tag is kept on purpose: non-PHP text follows in this file.
?>

Python代码：

import sys
import re
import requests

def get_img_links(url: str, timeout: float = 10.0) -> list:
    """Fetch a web page and return the absolute image URLs it references.

    The page is downloaded with a Chrome-like User-Agent, every ``<img>`` tag
    with a quoted ``src`` attribute is located with a regular expression, and
    only values starting with ``http://`` or ``https://`` are kept (which
    also excludes base64 ``data:image`` URIs and relative paths).

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The matching links, in document order; duplicates are not removed.

    Raises:
        SystemExit: with status 1 (after printing the error) when the page
            cannot be fetched or the server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        # Download the page; raise_for_status() turns 4xx/5xx into exceptions.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        html = response.text
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        sys.exit(1)

    # Capture the quoted src value of each <img> tag.
    pattern = r'<img[^>]*\bsrc=["\']([^"\']+)'

    # HTML tag and attribute names are case-insensitive, so <IMG SRC="..."> must
    # match too (this mirrors the /i flag of the PHP version of this script).
    matches = re.findall(pattern, html, flags=re.IGNORECASE)

    # Requiring an http(s) prefix already rules out "data:image" URIs.
    return [link for link in matches if link.startswith(('http://', 'https://'))]

if __name__ == "__main__":
    # Exactly one positional argument is expected: the URL of the page to scan.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python script.py <url>")
        sys.exit(1)

    # Emit each extracted image link on its own line.
    for image_url in get_img_links(cli_args[0]):
        print(image_url)