PDF Scraping: Trích Xuất Dữ Liệu Từ File PDF
PDF chứa nhiều data quý giá nhưng khó extract. Bài viết hướng dẫn scrape data từ PDFs hiệu quả.
Use Cases
- Financial reports: Annual reports, statements
- Research papers: Academic data
- Government docs: Public records
- Invoices: Business automation
- Contracts: Legal text extraction
1. PyPDF2 – Basic Text
from PyPDF2 import PdfReader
def extract_text_pypdf(pdf_path):
    """Extract plain text from every page of a PDF using PyPDF2.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: Text of all pages, each page's text followed by a newline.
    """
    reader = PdfReader(pdf_path)
    # extract_text() can return None for pages without an extractable text
    # layer (e.g. scanned images) -- `or ""` avoids a TypeError on concat.
    # join is linear; repeated str += in a loop is quadratic.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
# Usage
# Extract all text from report.pdf and dump it to stdout.
text = extract_text_pypdf('report.pdf')
print(text)
2. pdfplumber – Better Formatting
import pdfplumber
def extract_with_pdfplumber(pdf_path):
    """Extract text from a PDF with pdfplumber (better layout fidelity).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: Text of all pages, each page's text followed by a newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() returns None for pages with no text layer
        # (scanned pages); coerce to "" so concatenation cannot fail.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
# Extract tables
def extract_tables(pdf_path):
    """Collect every table found on every page of the PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        list: Flat list of tables; each table is a list of rows.
    """
    collected = []
    with pdfplumber.open(pdf_path) as pdf:
        for pg in pdf.pages:
            for tbl in pg.extract_tables():
                collected.append(tbl)
    return collected
# Print every table extracted from the financial report.
tables = extract_tables('financial_report.pdf')
for table in tables:
    print(table)
3. Camelot – Table Extraction
import camelot
# Extract tables from pages 1-5 of the PDF.
tables = camelot.read_pdf('report.pdf', pages='1-5')
print(f"Found {len(tables)} tables")

# Each Camelot table wraps a pandas DataFrame (.df);
# write one CSV per table and preview the first rows.
for idx, tbl in enumerate(tables):
    frame = tbl.df
    frame.to_csv(f'table_{idx}.csv', index=False)
    print(frame.head())
4. Tabula – Java-based Tables
import tabula

# Pull every table in the document in one call.
tables = tabula.read_pdf('report.pdf', pages='all')

# Restrict extraction to a rectangular region of page 1.
# Coordinates are points, ordered: top, left, bottom, right.
tables = tabula.read_pdf(
    'report.pdf',
    area=[100, 50, 500, 550],
    pages='1',
)

# One-shot conversion straight to CSV, no intermediate DataFrames.
tabula.convert_into('report.pdf', 'output.csv', output_format='csv')
5. OCR Cho Scanned PDFs
import pytesseract
from pdf2image import convert_from_path
def ocr_pdf(pdf_path):
    """OCR a scanned PDF by rasterizing each page and running Tesseract.

    Args:
        pdf_path: Path to the scanned (image-only) PDF.

    Returns:
        str: OCR text with a "--- Page N ---" header before each page.
    """
    # Rasterize every page to a PIL image (pdf2image requires poppler).
    images = convert_from_path(pdf_path)
    parts = []
    for page_no, image in enumerate(images, start=1):
        # lang='vie' loads Tesseract's Vietnamese traineddata.
        page_text = pytesseract.image_to_string(image, lang='vie')
        parts.append(f"--- Page {page_no} ---\n{page_text}\n")
    # join is O(total length); str += per page is quadratic.
    return "".join(parts)
# For Vietnamese PDFs
# OCR a scanned (image-only) document; the recognized text lands in `text`.
text = ocr_pdf('scanned_document.pdf')
6. Download PDFs From Web
import requests
from urllib.parse import urljoin
def download_pdfs(page_url):
    """Find every PDF link on a web page and download each into the CWD.

    Args:
        page_url: URL of the HTML page to scan for ``.pdf`` links.

    Raises:
        requests.HTTPError: If the page or any PDF responds with 4xx/5xx.
    """
    # Bug fix: BeautifulSoup was used but never imported anywhere in the
    # file; import locally to keep the function self-contained.
    from bs4 import BeautifulSoup

    response = requests.get(page_url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Resolve relative hrefs against the page URL.
    pdf_urls = [
        urljoin(page_url, link['href'])
        for link in soup.select('a[href$=".pdf"]')
    ]

    for pdf_url in pdf_urls:
        filename = pdf_url.split('/')[-1]
        pdf_response = requests.get(pdf_url, timeout=60)
        pdf_response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(pdf_response.content)
        # Bug fix: original printed the literal placeholder "(unknown)".
        print(f"Downloaded: {filename}")
# Grab every PDF linked from the company's reports page.
download_pdfs('https://company.com/reports')
7. Structured Data Extraction
import re
def extract_financial_data(text):
    """Parse headline financial figures out of raw report text.

    Looks for lines such as "Revenue: $1,234,567" and
    "Net Income: $89,000" and returns the amounts as integers.
    Matching is case-insensitive, a backward-compatible generalization
    over the original exact-case patterns.

    Args:
        text: Plain text extracted from a financial report.

    Returns:
        dict: 'revenue' and/or 'net_income' keys, present only when found.
    """
    def _amount(label):
        # Label, colon/whitespace separator, optional $, digits with commas.
        m = re.search(rf'{label}[:\s]+\$?([\d,]+)', text, re.IGNORECASE)
        return int(m.group(1).replace(',', '')) if m else None

    data = {}
    revenue = _amount('Revenue')
    if revenue is not None:
        data['revenue'] = revenue
    net_income = _amount('Net Income')
    if net_income is not None:
        data['net_income'] = net_income
    return data
# End-to-end: PyPDF2 text extraction feeding the regex-based parser.
text = extract_text_pypdf('annual_report.pdf')
financial = extract_financial_data(text)
VinaProxy + PDF Scraping
- Download PDFs từ protected sources
- Bypass geographic restrictions
- Giá chỉ $0.5/GB
