From f96a085aaf221bf20c90b9d2e90192dc4650518b Mon Sep 17 00:00:00 2001
From: Amoelle
Date: Wed, 9 Jul 2025 23:43:40 +0300
Subject: [PATCH] upgrade script

---
 crawl.sh | 797 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 627 insertions(+), 170 deletions(-)

diff --git a/crawl.sh b/crawl.sh
index a646435..b94137f 100755
--- a/crawl.sh
+++ b/crawl.sh
@@ -1,70 +1,260 @@
 #!/bin/bash
-if [ $# -lt 2 ]; then
- echo "Usage: $0 <start_url> <output_pdf> [--keep-temp]"
- echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page wikipedia.pdf --keep-temp"
- exit 1
-fi
+# Enhanced Website to PDF Converter
+# Supports both web crawling and local HTML processing

-START_URL="$1"
-OUTPUT_PDF="$2"
+# set -euo pipefail
+
+# Configuration
+readonly SCRIPT_NAME="$(basename "$0")"
+readonly VERSION="2.0"
+# Defaults; can be overridden at runtime via --max-depth / --max-pages
+MAX_DEPTH=2
+MAX_PAGES=100
+readonly WEASYPRINT_TIMEOUT=120
+
+# Color codes for output
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly YELLOW='\033[1;33m'
+readonly BLUE='\033[0;34m'
+readonly NC='\033[0m' # No Color
+
+# Global variables
+TEMP_DIR=""
 KEEP_TEMP=0
-if [ "$3" = "--keep-temp" ]; then
- KEEP_TEMP=1
-fi
+VERBOSE=0
+MODE=""
+SOURCE=""
+OUTPUT_PDF=""

-if ! command -v curl &> /dev/null; then
- echo "Error: curl is not installed."
- exit 1
-fi
-if ! command -v weasyprint &> /dev/null; then
- echo "Error: weasyprint is not installed."
- exit 1
-fi
-if ! command -v pdfunite &> /dev/null; then
- echo "Error: pdfunite is not installed."
- exit 1
-fi
+# Logging functions
+log_info() {
+ echo -e "${BLUE}[INFO]${NC} $1"
+}

-TEMP_DIR=$(mktemp -d)
-echo "Working in temporary directory: $TEMP_DIR"
+log_success() {
+ echo -e "${GREEN}[SUCCESS]${NC} $1"
+}

-DOMAIN=$(echo "$START_URL" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
-BASE_URL=$(echo "$START_URL" | grep -oE 'https?://[^/]+')
+log_warning() {
+ echo -e "${YELLOW}[WARNING]${NC} $1"
+}

-QUEUE_FILE="$TEMP_DIR/queue.txt"
-VISITED_FILE="$TEMP_DIR/visited.txt"
-CRAWL_ERROR_LOG="$TEMP_DIR/crawl_errors.log"
-WEASYPRINT_ERROR_LOG="$TEMP_DIR/weasyprint_errors.log"
-echo "$START_URL" > "$QUEUE_FILE"
-touch "$VISITED_FILE" "$CRAWL_ERROR_LOG" "$WEASYPRINT_ERROR_LOG"
+log_error() {
+ echo -e "${RED}[ERROR]${NC} $1" >&2
+}

-MAX_DEPTH=2
-MAX_PAGES=100
-PAGE_COUNT=0
+log_verbose() {
+ if [ "$VERBOSE" -eq 1 ]; then
+ echo -e "${BLUE}[VERBOSE]${NC} $1"
+ fi
+}

-mkdir -p "$TEMP_DIR/$DOMAIN"
+# Help function
+show_help() {
+ cat << EOF
+$SCRIPT_NAME v$VERSION - Enhanced Website to PDF Converter
+
+USAGE:
+ $SCRIPT_NAME web <start_url> <output_pdf> [options]
+ $SCRIPT_NAME local <html_directory> <output_pdf> [options]
+
+MODES:
+ web Crawl website starting from URL
+ local Process locally saved HTML files
+
+ARGUMENTS:
+ start_url Starting URL for web crawling
+ html_directory Directory containing HTML files for local processing
+ output_pdf Output PDF file path
+
+OPTIONS:
+ --keep-temp Keep temporary files for debugging
+ --verbose Enable verbose output
+ --max-depth N Maximum crawling depth (default: $MAX_DEPTH)
+ --max-pages N Maximum pages to crawl (default: $MAX_PAGES)
+ --help Show this help message
+
+EXAMPLES:
+ $SCRIPT_NAME web https://example.com/docs docs.pdf --verbose
+ $SCRIPT_NAME local ./saved_website/ website.pdf --keep-temp
+ $SCRIPT_NAME web https://en.wikipedia.org/wiki/Main_Page wiki.pdf --max-depth 1
+
+DEPENDENCIES:
+ curl, weasyprint, pdfunite (poppler-utils)
+
+EOF
+}
+
+# Cleanup function
+cleanup_on_signal() {
+ local exit_code=$?
+ log_warning "Script interrupted, cleaning up..."
+ if [ -n "$TEMP_DIR" ] && [ -d "$TEMP_DIR" ]; then
+ rm -rf "$TEMP_DIR"
+ log_verbose "Cleaned up temporary directory: $TEMP_DIR"
+ fi
+ exit $exit_code
+}
+
+# Manual cleanup function
+cleanup_temp() {
+ if [ -n "$TEMP_DIR" ] && [ -d "$TEMP_DIR" ]; then
+ if [ "$KEEP_TEMP" -eq 0 ]; then
+ rm -rf "$TEMP_DIR"
+ log_verbose "Cleaned up temporary directory: $TEMP_DIR"
+ else
+ log_info "Temporary directory preserved: $TEMP_DIR"
+ fi
+ fi
+}
+
+# Set up signal handlers for interruption only
+trap cleanup_on_signal INT TERM
+
+# Dependency check
+check_dependencies() {
+ local missing_deps=()
+
+ if [ "$MODE" = "web" ] && ! command -v curl &> /dev/null; then
+ missing_deps+=("curl")
+ fi
+
+ if ! command -v weasyprint &> /dev/null; then
+ missing_deps+=("weasyprint")
+ fi
+
+ if ! command -v pdfunite &> /dev/null; then
+ missing_deps+=("pdfunite (poppler-utils)")
+ fi
+
+ if [ ${#missing_deps[@]} -gt 0 ]; then
+ log_error "Missing dependencies: ${missing_deps[*]}"
+ log_error "Please install the required packages and try again."
+ exit 1
+ fi
+}
+
+# Parse command line arguments
+parse_arguments() {
+ if [ $# -lt 3 ]; then
+ show_help
+ exit 1
+ fi
+
+ MODE="$1"
+ SOURCE="$2"
+ OUTPUT_PDF="$3"
+ shift 3
+
+ # Validate mode
+ if [ "$MODE" != "web" ] && [ "$MODE" != "local" ]; then
+ log_error "Invalid mode: $MODE. Use 'web' or 'local'"
+ exit 1
+ fi
+
+ # Parse options
+ while [ $# -gt 0 ]; do
+ case "$1" in
+ --keep-temp)
+ KEEP_TEMP=1
+ ;;
+ --verbose)
+ VERBOSE=1
+ ;;
+ --max-depth)
+ if [ $# -lt 2 ] || ! [[ "$2" =~ ^[0-9]+$ ]]; then
+ log_error "--max-depth requires a numeric argument"
+ exit 1
+ fi
+ MAX_DEPTH="$2"
+ shift
+ ;;
+ --max-pages)
+ if [ $# -lt 2 ] || ! [[ "$2" =~ ^[0-9]+$ ]]; then
+ log_error "--max-pages requires a numeric argument"
+ exit 1
+ fi
+ MAX_PAGES="$2"
+ shift
+ ;;
+ --help)
+ show_help
+ exit 0
+ ;;
+ *)
+ log_error "Unknown option: $1"
+ exit 1
+ ;;
+ esac
+ shift
+ done
+}
+
+# Validate inputs
+validate_inputs() {
+ if [ "$MODE" = "web" ]; then
+ if ! [[ "$SOURCE" =~ ^https?:// ]]; then
+ log_error "Invalid URL format: $SOURCE"
+ exit 1
+ fi
+ elif [ "$MODE" = "local" ]; then
+ if [ ! -d "$SOURCE" ]; then
+ log_error "Directory not found: $SOURCE"
+ exit 1
+ fi
+ SOURCE="$(realpath "$SOURCE")"
+ fi
+
+ # Check if output directory exists
+ local output_dir="$(dirname "$OUTPUT_PDF")"
+ if [ ! -d "$output_dir" ]; then
+ log_error "Output directory does not exist: $output_dir"
+ exit 1
+ fi
+}
+
+# Initialize working environment
+init_environment() {
+ TEMP_DIR=$(mktemp -d)
+ log_verbose "Created temporary directory: $TEMP_DIR"
+
+ # Create log files
+ mkdir -p "$TEMP_DIR/logs"
+ touch "$TEMP_DIR/logs/crawl_errors.log"
+ touch "$TEMP_DIR/logs/weasyprint_errors.log"
+ touch "$TEMP_DIR/logs/processing.log"
+}
+
+# URL normalization functions (for web mode)
 normalize_url() {
 local url="$1"
- local base="$2"
+ local base_url="$2"
+ local current_url="$3"
+
 if [[ "$url" =~ ^/ ]]; then
- url="$BASE_URL$url"
+ url="$base_url$url"
 elif [[ ! "$url" =~ ^https?:// ]]; then
 url=$(echo "$url" | sed 's|^\./||')
- url="$BASE_URL/$(dirname "$base" | sed "s|^$BASE_URL||")/$url"
+ local current_dir="$(dirname "$current_url" | sed "s|^$base_url||")"
+ url="$base_url$current_dir/$url"
 fi
+
+ # Clean up URL
 url=$(echo "$url" | sed 's/#.*$//; s|/$||; s/\?.*$//')
 echo "$url"
 }

 url_to_filepath() {
 local url="$1"
- local path=$(echo "$url" | sed "s|^$BASE_URL/||; s|/$||; s|/|_|g")
+ local base_url="$2"
+ local domain="$3"
+
+ local path=$(echo "$url" | sed "s|^$base_url/||; s|/$||; s|/|_|g")
 if [ -z "$path" ]; then
 path="index"
 fi
- echo "$TEMP_DIR/$DOMAIN/$path.html"
+ echo "$TEMP_DIR/$domain/$path.html"
 }

 calculate_depth() {
@@ -77,153 +267,420 @@ calculate_depth() {
 fi
 }

-echo "Crawling $START_URL (links within $DOMAIN, max depth $MAX_DEPTH, max pages $MAX_PAGES)..."
-while [ -s "$QUEUE_FILE" ] && [ "$PAGE_COUNT" -lt "$MAX_PAGES" ]; do
- CURRENT_URL=$(head -n 1 "$QUEUE_FILE")
- sed -i '1d' "$QUEUE_FILE"
-
- if grep -Fx "$CURRENT_URL" "$VISITED_FILE" >/dev/null; then
- echo "Skipping already visited: $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
- continue
- fi
-
- echo "$CURRENT_URL" >> "$VISITED_FILE"
-
- DEPTH=$(calculate_depth "$CURRENT_URL")
- echo "Processing $CURRENT_URL (depth $DEPTH)" >> "$CRAWL_ERROR_LOG"
- if [ "$DEPTH" -gt "$MAX_DEPTH" ]; then
- echo "Skipping (depth $DEPTH > $MAX_DEPTH): $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
- continue
- fi
-
- FILE_PATH=$(url_to_filepath "$CURRENT_URL")
- echo "Downloading $CURRENT_URL to $FILE_PATH..."
- curl -s -L --fail --retry 3 --retry-delay 2 "$CURRENT_URL" > "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
- if [ $? -ne 0 ]; then
- echo "Warning: Failed to download $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
- continue
- fi
-
- ((PAGE_COUNT++))
- echo "Crawled page $PAGE_COUNT: $CURRENT_URL"
-
- LINKS=$(grep -o '<a[^>]*href=["'"'"'][^"'"'"']*["'"'"']' "$FILE_PATH" | sed 's/.*href=["'"'"']\([^"'"'"']*\)["'"'"'].*/\1/' | sort -u)
- for LINK in $LINKS; do
- if [ -z "$LINK" ] || [[ "$LINK" =~ ^(javascript:|mailto:|#|data:) ]]; then
+# Web crawling function
+crawl_website() {
+ local start_url="$SOURCE"
+ local domain=$(echo "$start_url" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
+ local base_url=$(echo "$start_url" | grep -oE 'https?://[^/]+')
+
+ log_info "Starting web crawl from: $start_url"
+ log_info "Domain: $domain, Max depth: $MAX_DEPTH, Max pages: $MAX_PAGES"
+
+ local queue_file="$TEMP_DIR/queue.txt"
+ local visited_file="$TEMP_DIR/visited.txt"
+ local crawl_log="$TEMP_DIR/logs/crawl_errors.log"
+
+ echo "$start_url" > "$queue_file"
+ touch "$visited_file"
+
+ mkdir -p "$TEMP_DIR/$domain"
+
+ local page_count=0
+
+ while [ -s "$queue_file" ] && [ "$page_count" -lt "$MAX_PAGES" ]; do
+ local current_url=$(head -n 1 "$queue_file")
+ sed -i '1d' "$queue_file"
+
+ if grep -Fx "$current_url" "$visited_file" >/dev/null; then
+ log_verbose "Skipping already visited: $current_url"
 continue
 fi
-
- NORMALIZED_LINK=$(normalize_url "$LINK" "$CURRENT_URL")
- if [[ "$NORMALIZED_LINK" =~ ^https?://$DOMAIN ]]; then
- LOCAL_PATH=$(url_to_filepath "$NORMALIZED_LINK")
- RELATIVE_PATH=$(realpath --relative-to="$(dirname "$FILE_PATH")" "$LOCAL_PATH" 2>/dev/null || echo "$LOCAL_PATH")
+ echo "$current_url" >> "$visited_file"
+
+ local depth=$(calculate_depth "$current_url")
+ log_verbose "Processing $current_url (depth $depth)"
+
+ if [ "$depth" -gt "$MAX_DEPTH" ]; then
+ log_verbose "Skipping (depth $depth > $MAX_DEPTH): $current_url"
+ continue
+ fi
+
+ local file_path=$(url_to_filepath "$current_url" "$base_url" "$domain")
+ log_verbose "Downloading $current_url to $file_path"
+
+ if curl -s -L --fail --retry 3 --retry-delay 2 --max-time 30 "$current_url" > "$file_path" 2>> "$crawl_log"; then
+ ((page_count++))
+ log_info "Crawled page $page_count: $current_url"

- ESCAPED_LINK=$(echo "$LINK" | sed 's/[&]/\\&/g')
- sed -i "s|href=[\"']${ESCAPED_LINK}[\"']|href=\"$RELATIVE_PATH\"|g" "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+ # Extract and process links
+ local links=$(grep -o '<a[^>]*href=["\047][^"\047]*["\047]' "$file_path" 2>/dev/null | sed 's/.*href=["\047]\([^"\047]*\)["\047].*/\1/' | sort -u)

- if ! grep -Fx "$NORMALIZED_LINK" "$VISITED_FILE" >/dev/null; then
- echo "$NORMALIZED_LINK" >> "$QUEUE_FILE"
- echo "Queued: $NORMALIZED_LINK" >> "$CRAWL_ERROR_LOG"
- fi
+ for link in $links; do
+ if [ -z "$link" ] || [[ "$link" =~ ^(javascript:|mailto:|#|data:) ]]; then
+ continue
+ fi
+
+ local normalized_link=$(normalize_url "$link" "$base_url" "$current_url")
+
+ if [[ "$normalized_link" =~ ^https?://$domain ]]; then
+ local local_path=$(url_to_filepath "$normalized_link" "$base_url" "$domain")
+ local relative_path=$(realpath --relative-to="$(dirname "$file_path")" "$local_path" 2>/dev/null || echo "$local_path")
+
+ # Update link in HTML file
+ local escaped_link=$(echo "$link" | sed 's/[&]/\\&/g; s/[[\.*^$(){}+?|\\]/\\&/g')
+ sed -i "s|href=[\"']${escaped_link}[\"']|href=\"$relative_path\"|g" "$file_path" 2>> "$crawl_log"
+
+ if ! grep -Fx "$normalized_link" "$visited_file" >/dev/null; then
+ echo "$normalized_link" >> "$queue_file"
+ log_verbose "Queued: $normalized_link"
+ fi
+ fi
+ done
+ else
+ log_warning "Failed to download $current_url"
 fi
 done
-done
+
+ log_success "Crawled $page_count pages"
+}

-echo "Generating list of HTML files..."
-find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) > "$TEMP_DIR/file_list.txt"
+# Local HTML processing function
+process_local_html() {
+ local html_dir="$SOURCE"
+ log_info "Processing local HTML files from: $html_dir"
+
+ # Find all HTML files recursively
+ local html_files=()
+ while IFS= read -r -d '' file; do
+ html_files+=("$file")
+ done < <(find "$html_dir" -type f \( -name "*.html" -o -name "*.htm" \) -print0)
+
+ if [ ${#html_files[@]} -eq 0 ]; then
+ log_error "No HTML files found in $html_dir"
+ exit 1
+ fi
+
+ log_info "Found ${#html_files[@]} HTML files"
+
+ # Copy HTML files to temp directory, maintaining structure
+ local temp_html_dir="$TEMP_DIR/html"
+ mkdir -p "$temp_html_dir"
+
+ for html_file in "${html_files[@]}"; do
+ local rel_path=$(realpath --relative-to="$html_dir" "$html_file")
+ local dest_path="$temp_html_dir/$rel_path"
+ local dest_dir=$(dirname "$dest_path")
+
+ mkdir -p "$dest_dir"
+ cp "$html_file" "$dest_path"
+ log_verbose "Copied: $rel_path"
+ done
+
+ # Also copy any associated assets (CSS, JS, images)
+ local asset_extensions=("css" "js" "png" "jpg" "jpeg" "gif" "svg" "ico" "woff" "woff2" "ttf" "eot")
+ for ext in "${asset_extensions[@]}"; do
+ while IFS= read -r -d '' file; do
+ local rel_path=$(realpath --relative-to="$html_dir" "$file")
+ local dest_path="$temp_html_dir/$rel_path"
+ local dest_dir=$(dirname "$dest_path")
+
+ mkdir -p "$dest_dir"
+ cp "$file" "$dest_path"
+ log_verbose "Copied asset: $rel_path"
+ done < <(find "$html_dir" -type f -name "*.${ext}" -print0 2>/dev/null)
+ done
+
+ log_success "Processed local HTML files"
+}

-if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
- echo "Error: No HTML files found. Check crawl_errors.log for details."
- cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
- cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
- cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
- cp "$VISITED_FILE" "./visited.log" 2>/dev/null
- cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
- if [ $KEEP_TEMP -eq 0 ]; then
- rm -rf "$TEMP_DIR"
+# Try alternative conversion for problematic files
+try_alternative_conversion() {
+ local html_file="$1"
+ local pdf_file="$2"
+ local html_dir="$3"
+ local weasyprint_log="$4"
+
+ log_verbose "Trying alternative conversion for $(basename "$html_file")"
+
+ # Try with simpler options (no base URL, different media type)
+ if timeout 30 weasyprint \
+ --media-type screen \
+ --pdf-version 1.4 \
+ "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
+ return 0
+ fi
+
+ # Try with minimal options
+ if timeout 30 weasyprint \
+ "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
+ return 0
+ fi
+
+ return 1
+}
+
+# PDF generation function
+generate_pdfs() {
+ local html_files=()
+ local pdf_files=()
+
+ # Find HTML files in temp directory
+ while IFS= read -r -d '' file; do
+ html_files+=("$file")
+ done < <(find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) -print0)
+
+ if [ ${#html_files[@]} -eq 0 ]; then
+ log_error "No HTML files found for PDF generation"
+ exit 1
+ fi
+
+ # Sort files for consistent processing order
+ IFS=$'\n' html_files=($(sort <<<"${html_files[*]}"))
+ unset IFS
+
+ log_info "Converting ${#html_files[@]} HTML files to PDF"
+
+ local success_count=0
+ local failed_count=0
+ local current_file=0
+ local weasyprint_log="$TEMP_DIR/logs/weasyprint_errors.log"
+
+ for html_file in "${html_files[@]}"; do
+ ((current_file++))
+ local html_dir=$(dirname "$html_file")
+ # Strip either .html or .htm before appending .pdf
+ local pdf_file="${html_file%.html}"
+ pdf_file="${pdf_file%.htm}.pdf"
+ local filename=$(basename "$html_file")
+
+ log_info "[$current_file/${#html_files[@]}] Converting $filename to PDF..."
+
+ local start_time=$(date +%s)
+
+ log_verbose "Running WeasyPrint on: $html_file"
+ if timeout "$WEASYPRINT_TIMEOUT" weasyprint \
+ --base-url "file://$html_dir/" \
+ --media-type print \
+ --pdf-version 1.7 \
+ --verbose \
+ "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
+
+ local end_time=$(date +%s)
+ local duration=$((end_time - start_time))
+
+ if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
+ pdf_files+=("$pdf_file")
+ ((success_count++))
+ log_success "✓ $filename converted successfully (${duration}s)"
+ else
+ log_warning "✗ $filename: PDF file is empty or missing"
+ ((failed_count++))
+ fi
+ else
+ local end_time=$(date +%s)
+ local duration=$((end_time - start_time))
+
+ if [ $duration -ge $WEASYPRINT_TIMEOUT ]; then
+ log_warning "✗ $filename: Conversion timed out after ${WEASYPRINT_TIMEOUT}s"
+ else
+ log_warning "✗ $filename: Conversion failed (${duration}s)"
+ fi
+
+ # Try alternative conversion
+ if try_alternative_conversion "$html_file" "$pdf_file" "$html_dir" "$weasyprint_log"; then
+ if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
+ pdf_files+=("$pdf_file")
+ ((success_count++))
+ log_success "✓ $filename converted with alternative method"
+ else
+ ((failed_count++))
+ fi
+ else
+ ((failed_count++))
+ fi
+ fi
+
+ # Show progress
+ if [ $((current_file % 10)) -eq 0 ] || [ $current_file -eq ${#html_files[@]} ]; then
+ log_info "Progress: $current_file/${#html_files[@]} (Success: $success_count, Failed: $failed_count)"
+ fi
+ done
+
+ if [ ${#pdf_files[@]} -eq 0 ]; then
+ log_error "No PDFs were generated successfully"
+ exit 1
+ fi
+
+ log_success "Generated $success_count PDFs successfully"
+ if [ $failed_count -gt 0 ]; then
+ log_warning "$failed_count files failed to convert"
+ fi
+
+ # Merge PDFs
+ log_info "Merging ${#pdf_files[@]} PDFs into $OUTPUT_PDF..."
+ if pdfunite "${pdf_files[@]}" "$OUTPUT_PDF" 2>> "$TEMP_DIR/logs/processing.log"; then
+ log_success "Successfully created $OUTPUT_PDF"
+ if command -v du &> /dev/null; then
+ local file_size=$(du -h "$OUTPUT_PDF" | cut -f1)
+ log_info "Final PDF size: $file_size"
+ fi
 else
- echo "Temporary directory preserved: $TEMP_DIR"
+ log_error "Failed to merge PDFs"
+ exit 1
 fi
- exit 1
-fi
+}

-echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."
-
-pdf_files=()
-
-while IFS= read -r html_file; do
- html_dir=$(dirname "$html_file")
+# Copy debug files
+copy_debug_files() {
+ local debug_files=(
+ "crawl_errors.log"
+ "weasyprint_errors.log"
+ "processing.log"
+ )

- if [ ! -f "$html_file" ]; then
- echo "Warning: $html_file not found, skipping." >> "$WEASYPRINT_ERROR_LOG"
- continue
+ for file in "${debug_files[@]}"; do
+ if [ -f "$TEMP_DIR/logs/$file" ]; then
+ cp "$TEMP_DIR/logs/$file" "./$file"
+ fi
+ done
+
+ # Copy other useful files
+ if [ -f "$TEMP_DIR/queue.txt" ]; then
+ cp "$TEMP_DIR/queue.txt" "./queue.log"
 fi
- pdf_file="${html_file%.html}.pdf"
- pdf_file="${pdf_file%.htm}.pdf"
-
- echo "Converting $html_file to PDF..."
- weasyprint \
- --base-url "file://$html_dir/" \
- --media-type print \
- "$html_file" "$pdf_file" 2>> "$WEASYPRINT_ERROR_LOG"
-
- if [ $? -ne 0 ]; then
- echo "Warning: Failed to convert $html_file to PDF, see weasyprint_errors.log for details." >> "$WEASYPRINT_ERROR_LOG"
- continue
+ if [ -f "$TEMP_DIR/visited.txt" ]; then
+ cp "$TEMP_DIR/visited.txt" "./visited.log"
 fi
- pdf_files+=("$pdf_file")
-done < "$TEMP_DIR/file_list.txt"
+ log_info "Debug files saved to current directory"
+}

-if [ ${#pdf_files[@]} -eq 0 ]; then
- echo "Error: No PDFs were generated. Check weasyprint_errors.log for details."
- cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
- cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
- cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
- cp "$VISITED_FILE" "./visited.log" 2>/dev/null
- cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
- if [ $KEEP_TEMP -eq 0 ]; then
- rm -rf "$TEMP_DIR"
- else
- echo "Temporary directory preserved: $TEMP_DIR"
- fi
- exit 1
-fi
+# Main function
+main() {
+ log_info "$SCRIPT_NAME v$VERSION starting..."
+
+ parse_arguments "$@"
+ validate_inputs
+ check_dependencies
+ init_environment
+
+ if [ "$MODE" = "web" ]; then
+ crawl_website
+ elif [ "$MODE" = "local" ]; then
+ process_local_html
+ fi
+
+ generate_pdfs
+ copy_debug_files
+ cleanup_temp
+
+ log_success "Process completed successfully!"
+ log_info "Output: $OUTPUT_PDF"
+}

-echo "Generated ${#pdf_files[@]} PDFs."
-
-echo "Merging PDFs into $OUTPUT_PDF..."
-pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"
-
-if [ $? -eq 0 ]; then
- echo "Successfully created $OUTPUT_PDF"
-else
- echo "Error merging PDFs."
- cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
- cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
- cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
- cp "$VISITED_FILE" "./visited.log" 2>/dev/null
- cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
- if [ $KEEP_TEMP -eq 0 ]; then
- rm -rf "$TEMP_DIR"
- else
- echo "Temporary directory preserved: $TEMP_DIR"
- fi
- exit 1
-fi
-
-cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
-cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
-cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
-cp "$VISITED_FILE" "./visited.log" 2>/dev/null
-cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
-echo "Debug files saved: crawl_errors.log, weasyprint_errors.log, queue.log, visited.log, file_list.log"
-
-if [ $KEEP_TEMP -eq 0 ]; then
- rm -rf "$TEMP_DIR"
- echo "Cleaned up temporary files."
-else
- echo "Temporary directory preserved: $TEMP_DIR"
-fi
+# Run main function with all arguments
+main "$@"
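
After the patch is applied, a quick smoke test of the new two-mode interface can be run from the repository root. This is only a sketch: the example.com URL and the local directory paths are placeholders, and it assumes poppler-utils is installed so that pdfinfo (shipped alongside pdfunite) is available to inspect the merged output.

    # Shallow crawl, then confirm the merged PDF exists and see its page count and size
    ./crawl.sh web https://example.com/docs docs.pdf --max-depth 1 --verbose
    pdfinfo docs.pdf | grep -E '^(Pages|File size)'

    # Convert an already-saved copy of a site instead of crawling it
    ./crawl.sh local ./saved_site/ site.pdf --keep-temp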