Код IT
← Каталог

Работа с файлами, сетью и внешними API — Парсинг и веб-скрапинг

Фрагмент из «Работа с файлами, сетью и внешними API»: Парсинг и веб-скрапинг.

Python main.py

import requests

from bs4 import BeautifulSoup

import csv

def _parse_price(text):
    """Convert a displayed price such as '89 990,50 ₽' into a float.

    Returns None when the cleaned-up text is still not a valid number.
    """
    # str.split() without arguments splits on every Unicode whitespace
    # character, so ordinary, non-breaking and narrow spaces used as
    # thousands separators are all removed in one pass.
    compact = ''.join(text.split())
    # Drop the rouble sign and accept a decimal comma.
    compact = compact.replace('₽', '').replace(',', '.')
    try:
        return float(compact)
    except ValueError:
        return None


def scrape_prices(url):
    """Download a product listing page and extract its product cards.

    Every ``<div class="product-item">`` that contains an
    ``<h3 class="title">``, a ``<span class="price">`` and an ``<a href>``
    yields one record; cards missing any of the three are skipped.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``'title'`` (str), ``'price'`` (float,
        or None when the price text cannot be parsed) and ``'url'`` (the
        card's link resolved against the URL of the downloaded page).
        An empty list is returned when the request fails; the error is
        printed rather than raised.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Data Scraper 1.0'}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Не удалось загрузить страницу: {e}")
        return []

    # The body is always decoded as UTF-8, whatever charset the server declared.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    products = []

    for item in soup.find_all('div', class_='product-item'):
        title_elem = item.find('h3', class_='title')
        price_elem = item.find('span', class_='price')
        link_elem = item.find('a', href=True)

        # Skip damaged cards that lack any of the required elements.
        if title_elem is None or price_elem is None or link_elem is None:
            continue

        products.append({
            'title': title_elem.get_text(strip=True),
            'price': _parse_price(price_elem.get_text(strip=True)),
            # urljoin leaves absolute links untouched and correctly resolves
            # root-relative, path-relative and protocol-relative hrefs,
            # instead of gluing a fixed host name in front of the string.
            'url': urljoin(response.url, link_elem['href']),
        })

    return products

# Run the scraper against the laptop category listing.
data = scrape_prices('https://shop.example.com/category/laptops')

# Save the collected rows to CSV; newline='' leaves line endings to the csv module.
columns = ['title', 'price', 'url']
with open('prices.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    table = csv.DictWriter(csv_file, fieldnames=columns)
    table.writeheader()
    for row in data:
        table.writerow(row)

import requests

from bs4 import BeautifulSoup

import csv

def _parse_price(text):
    """Convert a displayed price such as '89 990,50 ₽' into a float.

    Returns None when the cleaned-up text is still not a valid number.
    """
    # str.split() without arguments splits on every Unicode whitespace
    # character, so ordinary, non-breaking and narrow spaces used as
    # thousands separators are all removed in one pass.
    compact = ''.join(text.split())
    # Drop the rouble sign and accept a decimal comma.
    compact = compact.replace('₽', '').replace(',', '.')
    try:
        return float(compact)
    except ValueError:
        return None


def scrape_prices(url):
    """Download a product listing page and extract its product cards.

    Every ``<div class="product-item">`` that contains an
    ``<h3 class="title">``, a ``<span class="price">`` and an ``<a href>``
    yields one record; cards missing any of the three are skipped.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``'title'`` (str), ``'price'`` (float,
        or None when the price text cannot be parsed) and ``'url'`` (the
        card's link resolved against the URL of the downloaded page).
        An empty list is returned when the request fails; the error is
        printed rather than raised.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Data Scraper 1.0'}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Не удалось загрузить страницу: {e}")
        return []

    # The body is always decoded as UTF-8, whatever charset the server declared.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    products = []

    for item in soup.find_all('div', class_='product-item'):
        title_elem = item.find('h3', class_='title')
        price_elem = item.find('span', class_='price')
        link_elem = item.find('a', href=True)

        # Skip damaged cards that lack any of the required elements.
        if title_elem is None or price_elem is None or link_elem is None:
            continue

        products.append({
            'title': title_elem.get_text(strip=True),
            'price': _parse_price(price_elem.get_text(strip=True)),
            # urljoin leaves absolute links untouched and correctly resolves
            # root-relative, path-relative and protocol-relative hrefs,
            # instead of gluing a fixed host name in front of the string.
            'url': urljoin(response.url, link_elem['href']),
        })

    return products

# Run the scraper against the laptop category listing.
data = scrape_prices('https://shop.example.com/category/laptops')

# Save the collected rows to CSV; newline='' leaves line endings to the csv module.
columns = ['title', 'price', 'url']
with open('prices.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    table = csv.DictWriter(csv_file, fieldnames=columns)
    table.writeheader()
    for row in data:
        table.writerow(row)