init

2025-06-02 10:09:53 +03:00
parent cfdcd7550a
commit 2d60443be4
3 changed files with 161 additions and 60 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 *.pdf
+*.log
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
+Vibe-coded shitty tool to crawl websites into PDF files.
--- a/crawl.sh
+++ b/crawl.sh
@@ -1,28 +1,20 @@
 #!/bin/bash

-# Usage: ./crawl_to_pdf.sh <start_url> <max_depth> <output_pdf>
-# Example: ./crawl_to_pdf.sh https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf
-
-# Check if required arguments are provided
-if [ $# -ne 3 ]; then
-    echo "Usage: $0 <start_url> <max_depth> <output_pdf>"
-    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf"
+if [ $# -lt 2 ]; then
+    echo "Usage: $0 <start_url> <output_pdf> [--keep-temp]"
+    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page wikipedia.pdf --keep-temp"
    exit 1
 fi

 START_URL="$1"
-MAX_DEPTH="$2"
-OUTPUT_PDF="$3"
-
-# Validate max_depth is a positive integer
-if ! [[ "$MAX_DEPTH" =~ ^[0-9]+$ ]] || [ "$MAX_DEPTH" -lt 1 ]; then
-    echo "Error: max_depth must be a positive integer."
-    exit 1
+OUTPUT_PDF="$2"
+KEEP_TEMP=0
+if [ "$3" = "--keep-temp" ]; then
+    KEEP_TEMP=1
 fi

-# Check if wget, weasyprint, and pdfunite are installed
-if ! command -v wget &> /dev/null; then
-    echo "Error: wget is not installed."
+if ! command -v curl &> /dev/null; then
+    echo "Error: curl is not installed."
    exit 1
 fi
 if ! command -v weasyprint &> /dev/null; then
@@ -34,86 +26,174 @@ if ! command -v pdfunite &> /dev/null; then
    exit 1
 fi

-# Create a temporary directory for downloads
 TEMP_DIR=$(mktemp -d)
 echo "Working in temporary directory: $TEMP_DIR"

-# Extract domain from URL to restrict crawling
-DOMAIN=$(echo "$START_URL" | grep -oP '(?<=://)[^/]+')
+DOMAIN=$(echo "$START_URL" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
+BASE_URL=$(echo "$START_URL" | grep -oE 'https?://[^/]+')

-# Crawl the website using wget
-echo "Crawling $START_URL (max depth: $MAX_DEPTH)..."
-wget \
-    --recursive \
-    --level="$MAX_DEPTH" \
-    --convert-links \
-    --html-extension \
-    --no-parent \
-    --domains="$DOMAIN" \
-    --no-verbose \
-    --directory-prefix="$TEMP_DIR" \
-    "$START_URL"
+QUEUE_FILE="$TEMP_DIR/queue.txt"
+VISITED_FILE="$TEMP_DIR/visited.txt"
+CRAWL_ERROR_LOG="$TEMP_DIR/crawl_errors.log"
+WEASYPRINT_ERROR_LOG="$TEMP_DIR/weasyprint_errors.log"
+echo "$START_URL" > "$QUEUE_FILE"
+touch "$VISITED_FILE" "$CRAWL_ERROR_LOG" "$WEASYPRINT_ERROR_LOG"

-if [ $? -ne 0 ]; then
-    echo "Error: Failed to crawl $START_URL."
-    rm -rf "$TEMP_DIR"
-    exit 1
-fi
+MAX_DEPTH=2
+MAX_PAGES=100
+PAGE_COUNT=0
+
+mkdir -p "$TEMP_DIR/$DOMAIN"
+
+normalize_url() {
+    local url="$1"
+    local base="$2"
+    if [[ "$url" =~ ^/ ]]; then
+        url="$BASE_URL$url"
+    elif [[ ! "$url" =~ ^https?:// ]]; then
+        url=$(echo "$url" | sed 's|^\./||')
+        url="$BASE_URL/$(dirname "$base" | sed "s|^$BASE_URL||")/$url"
+    fi
+    url=$(echo "$url" | sed 's/#.*$//; s|/$||; s/\?.*$//')
+    echo "$url"
+}
+
+url_to_filepath() {
+    local url="$1"
+    local path=$(echo "$url" | sed "s|^$BASE_URL/||; s|/$||; s|/|_|g")
+    if [ -z "$path" ]; then
+        path="index"
+    fi
+    echo "$TEMP_DIR/$DOMAIN/$path.html"
+}
+
+calculate_depth() {
+    local url="$1"
+    local path=$(echo "$url" | sed "s|^https\?://[^/]*||; s|^/||; s|/$||")
+    if [ -z "$path" ]; then
+        echo 0
+    else
+        echo "$path" | awk -F'/' '{print NF}'
+    fi
+}
+
+echo "Crawling $START_URL (links within $DOMAIN, max depth $MAX_DEPTH, max pages $MAX_PAGES)..."
+while [ -s "$QUEUE_FILE" ] && [ "$PAGE_COUNT" -lt "$MAX_PAGES" ]; do
+    CURRENT_URL=$(head -n 1 "$QUEUE_FILE")
+    sed -i '1d' "$QUEUE_FILE"
+
+    if grep -Fx "$CURRENT_URL" "$VISITED_FILE" >/dev/null; then
+        echo "Skipping already visited: $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    echo "$CURRENT_URL" >> "$VISITED_FILE"
+
+    DEPTH=$(calculate_depth "$CURRENT_URL")
+    echo "Processing $CURRENT_URL (depth $DEPTH)" >> "$CRAWL_ERROR_LOG"
+    if [ "$DEPTH" -gt "$MAX_DEPTH" ]; then
+        echo "Skipping (depth $DEPTH > $MAX_DEPTH): $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    FILE_PATH=$(url_to_filepath "$CURRENT_URL")
+    echo "Downloading $CURRENT_URL to $FILE_PATH..."
+    curl -s -L --fail --retry 3 --retry-delay 2 "$CURRENT_URL" > "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to download $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    ((PAGE_COUNT++))
+    echo "Crawled page $PAGE_COUNT: $CURRENT_URL"
+
+    LINKS=$(grep -o '<a[^>]*href=["'"'"'][^"'"'"']*["'"'"']' "$FILE_PATH" | sed 's/.*href=["'"'"']\([^"'"'"']*\)["'"'"'].*/\1/' | sort -u)
+    for LINK in $LINKS; do
+        if [ -z "$LINK" ] || [[ "$LINK" =~ ^(javascript:|mailto:|#|data:) ]]; then
+            continue
+        fi
+
+        NORMALIZED_LINK=$(normalize_url "$LINK" "$CURRENT_URL")
+        
+        if [[ "$NORMALIZED_LINK" =~ ^https?://$DOMAIN ]]; then
+            LOCAL_PATH=$(url_to_filepath "$NORMALIZED_LINK")
+            RELATIVE_PATH=$(realpath --relative-to="$(dirname "$FILE_PATH")" "$LOCAL_PATH" 2>/dev/null || echo "$LOCAL_PATH")
+            
+            ESCAPED_LINK=$(echo "$LINK" | sed 's/[&]/\\&/g')
+            sed -i "s|href=[\"']${ESCAPED_LINK}[\"']|href=\"$RELATIVE_PATH\"|g" "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+            
+            if ! grep -Fx "$NORMALIZED_LINK" "$VISITED_FILE" >/dev/null; then
+                echo "$NORMALIZED_LINK" >> "$QUEUE_FILE"
+                echo "Queued: $NORMALIZED_LINK" >> "$CRAWL_ERROR_LOG"
+            fi
+        fi
+    done
+done

-# Generate file_list.txt with all HTML files
 echo "Generating list of HTML files..."
-find "$TEMP_DIR" -type f -name "*.html" > "$TEMP_DIR/file_list.txt"
+find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) > "$TEMP_DIR/file_list.txt"

-# Check if file_list.txt was generated successfully
 if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
-    echo "Error: No HTML files found."
-    rm -rf "$TEMP_DIR"
+    echo "Error: No HTML files found. Check crawl_errors.log for details."
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
    exit 1
 fi

 echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."

-# Array to store generated PDF files
 pdf_files=()

-# Convert each HTML file to PDF
 while IFS= read -r html_file; do
-    # Get the directory containing the HTML file
    html_dir=$(dirname "$html_file")
    
-    # Check if the HTML file exists
    if [ ! -f "$html_file" ]; then
-        echo "Warning: $html_file not found, skipping."
+        echo "Warning: $html_file not found, skipping." >> "$WEASYPRINT_ERROR_LOG"
        continue
    fi
    
-    # Define the output PDF for each HTML file
    pdf_file="${html_file%.html}.pdf"
+    pdf_file="${pdf_file%.htm}.pdf"
    
-    # Convert HTML to PDF using weasyprint
    echo "Converting $html_file to PDF..."
    weasyprint \
        --base-url "file://$html_dir/" \
        --media-type print \
-        "$html_file" "$pdf_file"
+        "$html_file" "$pdf_file" 2>> "$WEASYPRINT_ERROR_LOG"
    
    if [ $? -ne 0 ]; then
-        echo "Warning: Failed to convert $html_file to PDF, skipping."
+        echo "Warning: Failed to convert $html_file to PDF, see weasyprint_errors.log for details." >> "$WEASYPRINT_ERROR_LOG"
        continue
    fi
    
-    # Add the PDF to the array
    pdf_files+=("$pdf_file")
 done < "$TEMP_DIR/file_list.txt"

-# Check if any PDFs were generated
 if [ ${#pdf_files[@]} -eq 0 ]; then
-    echo "Error: No PDFs were generated."
-    rm -rf "$TEMP_DIR"
+    echo "Error: No PDFs were generated. Check weasyprint_errors.log for details."
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
    exit 1
 fi

-# Merge the PDFs into a single file
+echo "Generated ${#pdf_files[@]} PDFs."
+
 echo "Merging PDFs into $OUTPUT_PDF..."
 pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"

@@ -121,10 +201,29 @@ if [ $? -eq 0 ]; then
    echo "Successfully created $OUTPUT_PDF"
 else
    echo "Error merging PDFs."
-    rm -rf "$TEMP_DIR"
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
    exit 1
 fi

-# Clean up temporary directory
-rm -rf "$TEMP_DIR"
-echo "Cleaned up temporary files."
+cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+echo "Debug files saved: crawl_errors.log, weasyprint_errors.log, queue.log, visited.log, file_list.log"
+
+if [ $KEEP_TEMP -eq 0 ]; then
+    rm -rf "$TEMP_DIR"
+    echo "Cleaned up temporary files."
+else
+    echo "Temporary directory preserved: $TEMP_DIR"
+fi
				`@@ -0,0 +1 @@`
				`Vibe-coded shitty tool to crawl websites into PDF files.`