PDF Scraping: Trích Xuất Dữ Liệu Từ File PDF

Trở lại Tin tức
Tin tức

PDF Scraping: Trích Xuất Dữ Liệu Từ File PDF

PDF chứa nhiều dữ liệu quý giá nhưng khó trích xuất. Bài viết này hướng dẫn cách scrape dữ liệu từ file PDF một cách hiệu quả.

Use Cases

  • Financial reports: Annual reports, statements
  • Research papers: Academic data
  • Government docs: Public records
  • Invoices: Business automation
  • Contracts: Legal text extraction

1. PyPDF2 – Basic Text

from PyPDF2 import PdfReader

def extract_text_pypdf(pdf_path):
    """Extract the text layer from every page of a PDF via PyPDF2.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        All page texts concatenated, each followed by a newline.
    """
    reader = PdfReader(pdf_path)
    # extract_text() returns None for pages without a text layer
    # (e.g. scanned images); the original `text += ...` raised TypeError
    # there. Coerce to "" and build the result with a single join.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Usage
# NOTE: expects 'report.pdf' to exist in the current working directory.
text = extract_text_pypdf('report.pdf')
print(text)

2. pdfplumber – Better Formatting

import pdfplumber

def extract_with_pdfplumber(pdf_path):
    """Extract text from every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        All page texts concatenated, each followed by a newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Page.extract_text() can return None on pages with no extractable
        # text — fall back to "" so the concatenation never raises.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

# Extract tables
def extract_tables(pdf_path):
    """Collect every table found on every page of the PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A flat list of tables (each table is a list of row lists),
        in page order.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [table
                for page in pdf.pages
                for table in page.extract_tables()]

# Demo: dump every table found in the report to stdout.
tables = extract_tables('financial_report.pdf')
for table in tables:
    print(table)

3. Camelot – Table Extraction

import camelot

# Extract tables from PDF
# NOTE: Camelot works on text-based PDFs only, not scanned images.
tables = camelot.read_pdf('report.pdf', pages='1-5')

print(f"Found {len(tables)} tables")

# Convert to pandas DataFrame
# Each Camelot table exposes a .df attribute; write one CSV per table.
for i, table in enumerate(tables):
    df = table.df
    df.to_csv(f'table_{i}.csv', index=False)
    print(df.head())

4. Tabula – Java-based Tables

import tabula

# Extract all tables
# NOTE: tabula-py requires a Java runtime on the machine.
tables = tabula.read_pdf('report.pdf', pages='all')

# Extract specific area
# Coordinates are in PDF points, measured from the top-left of the page.
tables = tabula.read_pdf(
    'report.pdf',
    area=[100, 50, 500, 550],  # top, left, bottom, right
    pages='1'
)

# Convert to CSV
tabula.convert_into('report.pdf', 'output.csv', output_format='csv')

5. OCR Cho Scanned PDFs

import pytesseract
from pdf2image import convert_from_path

def ocr_pdf(pdf_path):
    """Run Tesseract OCR (Vietnamese model) over each page of a scanned PDF.

    Args:
        pdf_path: Path to the (typically image-only) PDF file.

    Returns:
        The recognized text, with a "--- Page N ---" banner before each page.
    """
    # Rasterize every page, then OCR each image in turn.
    page_images = convert_from_path(pdf_path)

    chunks = []
    for page_num, image in enumerate(page_images, start=1):
        recognized = pytesseract.image_to_string(image, lang='vie')
        chunks.append(f"--- Page {page_num} ---\n{recognized}\n")

    return "".join(chunks)

# For Vietnamese PDFs
# NOTE: requires the Tesseract 'vie' language pack to be installed.
text = ocr_pdf('scanned_document.pdf')

6. Download PDFs From Web

import requests
from urllib.parse import urljoin

def download_pdfs(page_url):
    """Find every PDF link on *page_url* and download each file locally.

    Files are saved in the current working directory, named after the
    last path segment of each PDF URL.

    Args:
        page_url: URL of the HTML page listing the PDF links.

    Raises:
        requests.HTTPError: if the page or any PDF request fails.
    """
    # Imported locally: the original snippet used BeautifulSoup without
    # importing it anywhere, which raised NameError at runtime.
    from bs4 import BeautifulSoup

    response = requests.get(page_url)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.text, 'lxml')

    # Resolve relative hrefs against the page URL.
    pdf_urls = [urljoin(page_url, link['href'])
                for link in soup.select('a[href$=".pdf"]')]

    for pdf_url in pdf_urls:
        filename = pdf_url.split('/')[-1]
        pdf_response = requests.get(pdf_url)
        pdf_response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(pdf_response.content)
        # Fixed: the original printed the literal placeholder "(unknown)"
        # instead of the downloaded file's name.
        print(f"Downloaded: {filename}")

# Demo: crawl a reports page and download every linked PDF.
download_pdfs('https://company.com/reports')

7. Structured Data Extraction

import re

def _extract_amount(label_pattern, text):
    """Return the integer amount following *label_pattern*, or None.

    Matches e.g. "Revenue: $1,234,567" — the label, separators, an
    optional dollar sign, then digits with optional thousands commas.
    """
    match = re.search(label_pattern + r'[:\s]+\$?([\d,]+)', text)
    if match:
        return int(match.group(1).replace(',', ''))
    return None


def extract_financial_data(text):
    """Pull key financial figures out of raw report text.

    Args:
        text: Plain text of a financial report (e.g. from a PDF extractor).

    Returns:
        Dict with 'revenue' and/or 'net_income' integer values; a key is
        omitted when the corresponding figure is not found in the text.
    """
    data = {}
    # Same lookup for each figure — the duplicated regex/parse logic of
    # the original is factored into _extract_amount.
    for key, label in (('revenue', r'Revenue'), ('net_income', r'Net Income')):
        amount = _extract_amount(label, text)
        if amount is not None:
            data[key] = amount
    return data

# Demo: extract the report text, then parse the key financial figures.
text = extract_text_pypdf('annual_report.pdf')
financial = extract_financial_data(text)

VinaProxy + PDF Scraping

  • Download PDFs từ protected sources
  • Bypass geographic restrictions
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →