View Single Post
Denne skal holde styr på hvilke filer som har blitt lastet ned, samt sjekke om de er tomme eller ei. Hvis du kjører koden en annen dag, vil den fortsette der den slapp. Den kjører også 100 tråder samtidig, altså 100 nedlastinger om gangen. Det kan være at servereieren får et DDoS-varsel på grunn av det, men det får du justere selv.

Kode

import requests
import sqlite3
import os
import PyPDF2
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

# SQLite file that records which ids have already been saved.
_DB_PATH = 'pdfs.db'
# Seconds to wait on a busy database before sqlite3 gives up with "database is locked".
_DB_TIMEOUT = 30
# Seconds to wait on the server before the HTTP request is abandoned.
_HTTP_TIMEOUT = 30


def _is_downloaded(i):
    """Return True when the pdfs table flags id *i* as downloaded."""
    # A fresh connection per call: sqlite3 connections must not be shared
    # between threads by default, and this function runs on worker threads.
    db = sqlite3.connect(_DB_PATH, timeout=_DB_TIMEOUT)
    try:
        # Select the flag by name instead of relying on its position in SELECT *.
        row = db.execute("SELECT downloaded FROM pdfs WHERE id=?", (i,)).fetchone()
    finally:
        db.close()
    return row is not None and row[0] == 1


def _mark_downloaded(i, url):
    """Record id *i* (fetched from *url*) as downloaded in the pdfs table."""
    db = sqlite3.connect(_DB_PATH, timeout=_DB_TIMEOUT)
    try:
        # REPLACE rather than IGNORE so an existing row whose flag is not 1
        # gets updated instead of being left untouched.
        db.execute("INSERT OR REPLACE INTO pdfs VALUES (?, ?, ?)", (i, url, 1))
        db.commit()
    finally:
        db.close()


# Function to download a single PDF
def download_pdf(i):
    """Download ``https://eksempel.org/{i}.pdf`` and save it as ``{i}.pdf``.

    The file is skipped (with a message on stdout) when it is already
    recorded as downloaded, when the server does not answer with a success
    status, when the request fails, or when the payload is not a PDF with
    at least one page. On success the file is written to the current
    directory and the id is recorded in the ``pdfs`` table.

    Args:
        i: Numeric id of the PDF; used in both the URL and the file name.

    Returns:
        None.
    """
    url = f'https://eksempel.org/{i}.pdf'

    # Check if the PDF has already been downloaded
    if _is_downloaded(i):
        print(f"Skipping {url}, already downloaded.")
        return

    # Download the PDF. The context manager releases the connection even on
    # the early-return paths where the streamed body is never read.
    try:
        with requests.get(url, stream=True, timeout=_HTTP_TIMEOUT) as response:
            if response.status_code == 404:
                print(f"{url} not found.")
                return
            if not response.ok:
                # Other error statuses carry an error page, not a PDF.
                print(f"Skipping {url}, HTTP status {response.status_code}.")
                return
            content = response.content
    except requests.RequestException as exc:
        # One unreachable file should not abort the whole batch.
        print(f"Skipping {url}, request failed: {exc}")
        return

    # Check if the PDF is not empty
    # NOTE(review): PdfFileReader / getNumPages / PyPDF2.utils are the older
    # PyPDF2 names; confirm the installed PyPDF2 version still provides them.
    try:
        reader = PyPDF2.PdfFileReader(BytesIO(content))
        if reader.getNumPages() == 0:
            print(f"Skipping {url}, empty file.")
            return
    except PyPDF2.utils.PdfReadError:
        print(f"Skipping {url}, not a valid PDF.")
        return

    # Save the PDF
    with open(f'{i}.pdf', 'wb') as f:
        f.write(content)

    # Record the PDF as downloaded
    _mark_downloaded(i, url)

# Tunables for a single run: how many downloads are in flight at once and
# how many consecutive ids are attempted. Lower MAX_WORKERS to be gentler
# on the remote server.
MAX_WORKERS = 100
BATCH_SIZE = 1000

# Connect to SQLite3 database, create if not exists
conn = sqlite3.connect('pdfs.db')
try:
    c = conn.cursor()

    # Create table if not exists
    c.execute('''
        CREATE TABLE IF NOT EXISTS pdfs
        (id INTEGER PRIMARY KEY, url TEXT, downloaded INTEGER)
    ''')

    # Find the maximum id in the database, and start from there.
    # If the database is empty, start from 1.
    # NOTE(review): as far as this file shows, a row is only written for a
    # PDF that was saved, and workers finish out of order, so resuming at
    # MAX(id) + 1 can skip lower ids that were still in flight when an
    # earlier run stopped. Confirm whether that is acceptable.
    c.execute("SELECT MAX(id) FROM pdfs")
    result = c.fetchone()
    start = 1 if result[0] is None else result[0] + 1

    # Use ThreadPoolExecutor to download multiple PDFs at the same time
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(download_pdf, i)
            for i in range(start, start + BATCH_SIZE)
        ]
        for future in as_completed(futures):
            future.result()  # to raise exceptions if any
finally:
    # Close the database connection even when a worker exception is
    # re-raised by future.result() above.
    conn.close()