upgrade script
crawl.sh
@@ -1,70 +1,260 @@
#!/bin/bash

if [ $# -lt 2 ]; then
    echo "Usage: $0 <start_url> <output_pdf> [--keep-temp]"
    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page wikipedia.pdf --keep-temp"
    exit 1
fi
# Enhanced Website to PDF Converter
# Supports both web crawling and local HTML processing

START_URL="$1"
OUTPUT_PDF="$2"
# set -euo pipefail

# Configuration
readonly SCRIPT_NAME="$(basename "$0")"
readonly VERSION="2.0"
readonly MAX_DEPTH=2
readonly MAX_PAGES=100
readonly WEASYPRINT_TIMEOUT=120
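# WEASYPRINT_TIMEOUT is in seconds; it is passed to timeout(1) when weasyprint is invoked below.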

# Color codes for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Global variables
TEMP_DIR=""
KEEP_TEMP=0
if [ "$3" = "--keep-temp" ]; then
    KEEP_TEMP=1
fi
VERBOSE=0
MODE=""
SOURCE=""
OUTPUT_PDF=""

if ! command -v curl &> /dev/null; then
    echo "Error: curl is not installed."
    exit 1
fi
if ! command -v weasyprint &> /dev/null; then
    echo "Error: weasyprint is not installed."
    exit 1
fi
if ! command -v pdfunite &> /dev/null; then
    echo "Error: pdfunite is not installed."
    exit 1
fi
# Logging functions
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

TEMP_DIR=$(mktemp -d)
echo "Working in temporary directory: $TEMP_DIR"
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

DOMAIN=$(echo "$START_URL" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
BASE_URL=$(echo "$START_URL" | grep -oE 'https?://[^/]+')
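# Example: START_URL=https://en.wikipedia.org/wiki/Main_Page yields DOMAIN=en.wikipedia.org and BASE_URL=https://en.wikipedia.org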
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

QUEUE_FILE="$TEMP_DIR/queue.txt"
VISITED_FILE="$TEMP_DIR/visited.txt"
CRAWL_ERROR_LOG="$TEMP_DIR/crawl_errors.log"
WEASYPRINT_ERROR_LOG="$TEMP_DIR/weasyprint_errors.log"
echo "$START_URL" > "$QUEUE_FILE"
touch "$VISITED_FILE" "$CRAWL_ERROR_LOG" "$WEASYPRINT_ERROR_LOG"
log_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

MAX_DEPTH=2
MAX_PAGES=100
PAGE_COUNT=0
log_verbose() {
    if [ "$VERBOSE" -eq 1 ]; then
        echo -e "${BLUE}[VERBOSE]${NC} $1"
    fi
}

mkdir -p "$TEMP_DIR/$DOMAIN"
# Help function
show_help() {
    cat << EOF
$SCRIPT_NAME v$VERSION - Enhanced Website to PDF Converter

USAGE:
    $SCRIPT_NAME web <start_url> <output_pdf> [options]
    $SCRIPT_NAME local <html_directory> <output_pdf> [options]

MODES:
    web              Crawl website starting from URL
    local            Process locally saved HTML files

ARGUMENTS:
    start_url        Starting URL for web crawling
    html_directory   Directory containing HTML files for local processing
    output_pdf       Output PDF file path

OPTIONS:
    --keep-temp      Keep temporary files for debugging
    --verbose        Enable verbose output
    --max-depth N    Maximum crawling depth (default: $MAX_DEPTH)
    --max-pages N    Maximum pages to crawl (default: $MAX_PAGES)
    --help           Show this help message

EXAMPLES:
    $SCRIPT_NAME web https://example.com/docs docs.pdf --verbose
    $SCRIPT_NAME local ./saved_website/ website.pdf --keep-temp
    $SCRIPT_NAME web https://en.wikipedia.org/wiki/Main_Page wiki.pdf --max-depth 1

DEPENDENCIES:
    curl, weasyprint, pdfunite (poppler-utils)

EOF
}

# Cleanup function
cleanup_on_signal() {
    local exit_code=$?
    log_warning "Script interrupted, cleaning up..."
    if [ -n "$TEMP_DIR" ] && [ -d "$TEMP_DIR" ]; then
        rm -rf "$TEMP_DIR"
        log_verbose "Cleaned up temporary directory: $TEMP_DIR"
    fi
    exit $exit_code
}

# Manual cleanup function
cleanup_temp() {
    if [ -n "$TEMP_DIR" ] && [ -d "$TEMP_DIR" ]; then
        if [ "$KEEP_TEMP" -eq 0 ]; then
            rm -rf "$TEMP_DIR"
            log_verbose "Cleaned up temporary directory: $TEMP_DIR"
        else
            log_info "Temporary directory preserved: $TEMP_DIR"
        fi
    fi
}

# Set up signal handlers for interruption only
trap cleanup_on_signal INT TERM
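# Note: no EXIT trap is installed here; a normal run relies on main calling cleanup_temp explicitly.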

# Dependency check
check_dependencies() {
    local missing_deps=()

    if [ "$MODE" = "web" ] && ! command -v curl &> /dev/null; then
        missing_deps+=("curl")
    fi

    if ! command -v weasyprint &> /dev/null; then
        missing_deps+=("weasyprint")
    fi

    if ! command -v pdfunite &> /dev/null; then
        missing_deps+=("pdfunite (poppler-utils)")
    fi

    if [ ${#missing_deps[@]} -gt 0 ]; then
        log_error "Missing dependencies: ${missing_deps[*]}"
        log_error "Please install the required packages and try again."
        exit 1
    fi
}

# Parse command line arguments
parse_arguments() {
    if [ $# -lt 3 ]; then
        show_help
        exit 1
    fi

    MODE="$1"
    SOURCE="$2"
    OUTPUT_PDF="$3"
    shift 3

    # Validate mode
    if [ "$MODE" != "web" ] && [ "$MODE" != "local" ]; then
        log_error "Invalid mode: $MODE. Use 'web' or 'local'"
        exit 1
    fi

    # Parse options
    while [ $# -gt 0 ]; do
        case "$1" in
            --keep-temp)
                KEEP_TEMP=1
                ;;
            --verbose)
                VERBOSE=1
                ;;
            --max-depth)
                if [ $# -lt 2 ] || ! [[ "$2" =~ ^[0-9]+$ ]]; then
                    log_error "--max-depth requires a numeric argument"
                    exit 1
                fi
                MAX_DEPTH="$2"
                shift
                ;;
            --max-pages)
                if [ $# -lt 2 ] || ! [[ "$2" =~ ^[0-9]+$ ]]; then
                    log_error "--max-pages requires a numeric argument"
                    exit 1
                fi
                MAX_PAGES="$2"
                shift
                ;;
            --help)
                show_help
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                exit 1
                ;;
        esac
        shift
    done
}
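# Example: "crawl.sh web https://example.com/docs out.pdf --verbose --keep-temp" leaves
# MODE=web, SOURCE=https://example.com/docs, OUTPUT_PDF=out.pdf, VERBOSE=1, KEEP_TEMP=1.
# Note that MAX_DEPTH and MAX_PAGES are declared readonly in the configuration block,
# so the --max-depth/--max-pages assignments above cannot actually override them as written.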

# Validate inputs
validate_inputs() {
    if [ "$MODE" = "web" ]; then
        if ! [[ "$SOURCE" =~ ^https?:// ]]; then
            log_error "Invalid URL format: $SOURCE"
            exit 1
        fi
    elif [ "$MODE" = "local" ]; then
        if [ ! -d "$SOURCE" ]; then
            log_error "Directory not found: $SOURCE"
            exit 1
        fi
        SOURCE="$(realpath "$SOURCE")"
    fi

    # Check if output directory exists
    local output_dir="$(dirname "$OUTPUT_PDF")"
    if [ ! -d "$output_dir" ]; then
        log_error "Output directory does not exist: $output_dir"
        exit 1
    fi
}

# Initialize working environment
init_environment() {
    TEMP_DIR=$(mktemp -d)
    log_verbose "Created temporary directory: $TEMP_DIR"

    # Create log files
    mkdir -p "$TEMP_DIR/logs"
    touch "$TEMP_DIR/logs/crawl_errors.log"
    touch "$TEMP_DIR/logs/weasyprint_errors.log"
    touch "$TEMP_DIR/logs/processing.log"
}

# URL normalization functions (for web mode)
normalize_url() {
    local url="$1"
    local base="$2"
    local base_url="$2"
    local current_url="$3"

    if [[ "$url" =~ ^/ ]]; then
        url="$BASE_URL$url"
        url="$base_url$url"
    elif [[ ! "$url" =~ ^https?:// ]]; then
        url=$(echo "$url" | sed 's|^\./||')
        url="$BASE_URL/$(dirname "$base" | sed "s|^$BASE_URL||")/$url"
        local current_dir="$(dirname "$current_url" | sed "s|^$base_url||")"
        url="$base_url$current_dir/$url"
    fi

    # Clean up URL
    url=$(echo "$url" | sed 's/#.*$//; s|/$||; s/\?.*$//')
    echo "$url"
}
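# Example (three-argument form): normalize_url "/wiki/Foo?x=1#top" "https://en.wikipedia.org" "https://en.wikipedia.org/wiki/Main_Page"
# returns https://en.wikipedia.org/wiki/Foo -- root-relative links get the base URL prepended,
# and fragments, query strings and trailing slashes are stripped.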

url_to_filepath() {
    local url="$1"
    local path=$(echo "$url" | sed "s|^$BASE_URL/||; s|/$||; s|/|_|g")
    local base_url="$2"
    local domain="$3"

    local path=$(echo "$url" | sed "s|^$base_url/||; s|/$||; s|/|_|g")
    if [ -z "$path" ]; then
        path="index"
    fi
    echo "$TEMP_DIR/$DOMAIN/$path.html"
    echo "$TEMP_DIR/$domain/$path.html"
}
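# Example: url_to_filepath "https://en.wikipedia.org/wiki/Main_Page" "https://en.wikipedia.org" "en.wikipedia.org"
# prints $TEMP_DIR/en.wikipedia.org/wiki_Main_Page.html -- slashes in the URL path become underscores,
# and an empty path maps to index.html.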

calculate_depth() {
@@ -77,153 +267,420 @@ calculate_depth() {
    fi
}

echo "Crawling $START_URL (links within $DOMAIN, max depth $MAX_DEPTH, max pages $MAX_PAGES)..."
while [ -s "$QUEUE_FILE" ] && [ "$PAGE_COUNT" -lt "$MAX_PAGES" ]; do
    CURRENT_URL=$(head -n 1 "$QUEUE_FILE")
    sed -i '1d' "$QUEUE_FILE"

    if grep -Fx "$CURRENT_URL" "$VISITED_FILE" >/dev/null; then
        echo "Skipping already visited: $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
        continue
    fi

    echo "$CURRENT_URL" >> "$VISITED_FILE"

    DEPTH=$(calculate_depth "$CURRENT_URL")
    echo "Processing $CURRENT_URL (depth $DEPTH)" >> "$CRAWL_ERROR_LOG"
    if [ "$DEPTH" -gt "$MAX_DEPTH" ]; then
        echo "Skipping (depth $DEPTH > $MAX_DEPTH): $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
        continue
    fi

    FILE_PATH=$(url_to_filepath "$CURRENT_URL")
    echo "Downloading $CURRENT_URL to $FILE_PATH..."
    curl -s -L --fail --retry 3 --retry-delay 2 "$CURRENT_URL" > "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
    if [ $? -ne 0 ]; then
        echo "Warning: Failed to download $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
        continue
    fi

    ((PAGE_COUNT++))
    echo "Crawled page $PAGE_COUNT: $CURRENT_URL"

    LINKS=$(grep -o '<a[^>]*href=["'"'"'][^"'"'"']*["'"'"']' "$FILE_PATH" | sed 's/.*href=["'"'"']\([^"'"'"']*\)["'"'"'].*/\1/' | sort -u)
    for LINK in $LINKS; do
        if [ -z "$LINK" ] || [[ "$LINK" =~ ^(javascript:|mailto:|#|data:) ]]; then
# Web crawling function
crawl_website() {
    local start_url="$SOURCE"
    local domain=$(echo "$start_url" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
    local base_url=$(echo "$start_url" | grep -oE 'https?://[^/]+')

    log_info "Starting web crawl from: $start_url"
    log_info "Domain: $domain, Max depth: $MAX_DEPTH, Max pages: $MAX_PAGES"

    local queue_file="$TEMP_DIR/queue.txt"
    local visited_file="$TEMP_DIR/visited.txt"
    local crawl_log="$TEMP_DIR/logs/crawl_errors.log"

    echo "$start_url" > "$queue_file"
    touch "$visited_file"

    mkdir -p "$TEMP_DIR/$domain"

    local page_count=0

    while [ -s "$queue_file" ] && [ "$page_count" -lt "$MAX_PAGES" ]; do
        local current_url=$(head -n 1 "$queue_file")
        sed -i '1d' "$queue_file"

        if grep -Fx "$current_url" "$visited_file" >/dev/null; then
            log_verbose "Skipping already visited: $current_url"
            continue
        fi

        NORMALIZED_LINK=$(normalize_url "$LINK" "$CURRENT_URL")

        if [[ "$NORMALIZED_LINK" =~ ^https?://$DOMAIN ]]; then
            LOCAL_PATH=$(url_to_filepath "$NORMALIZED_LINK")
            RELATIVE_PATH=$(realpath --relative-to="$(dirname "$FILE_PATH")" "$LOCAL_PATH" 2>/dev/null || echo "$LOCAL_PATH")
        echo "$current_url" >> "$visited_file"

        local depth=$(calculate_depth "$current_url")
        log_verbose "Processing $current_url (depth $depth)"

        if [ "$depth" -gt "$MAX_DEPTH" ]; then
            log_verbose "Skipping (depth $depth > $MAX_DEPTH): $current_url"
            continue
        fi

        local file_path=$(url_to_filepath "$current_url" "$base_url" "$domain")
        log_verbose "Downloading $current_url to $file_path"

        if curl -s -L --fail --retry 3 --retry-delay 2 --max-time 30 "$current_url" > "$file_path" 2>> "$crawl_log"; then
            ((page_count++))
            log_info "Crawled page $page_count: $current_url"

            ESCAPED_LINK=$(echo "$LINK" | sed 's/[&]/\\&/g')
            sed -i "s|href=[\"']${ESCAPED_LINK}[\"']|href=\"$RELATIVE_PATH\"|g" "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
            # Extract and process links
            local links=$(grep -o '<a[^>]*href=["\047][^"\047]*["\047]' "$file_path" 2>/dev/null | sed 's/.*href=["\047]\([^"\047]*\)["\047].*/\1/' | sort -u)
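            # Intended to capture href="..." and href='...' targets (\047 standing in for a single
            # quote) and de-duplicate them; note that grep does not expand \047 in a single-quoted pattern.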

            if ! grep -Fx "$NORMALIZED_LINK" "$VISITED_FILE" >/dev/null; then
                echo "$NORMALIZED_LINK" >> "$QUEUE_FILE"
                echo "Queued: $NORMALIZED_LINK" >> "$CRAWL_ERROR_LOG"
            fi
            for link in $links; do
                if [ -z "$link" ] || [[ "$link" =~ ^(javascript:|mailto:|#|data:) ]]; then
                    continue
                fi

                local normalized_link=$(normalize_url "$link" "$base_url" "$current_url")

                if [[ "$normalized_link" =~ ^https?://$domain ]]; then
                    local local_path=$(url_to_filepath "$normalized_link" "$base_url" "$domain")
                    local relative_path=$(realpath --relative-to="$(dirname "$file_path")" "$local_path" 2>/dev/null || echo "$local_path")

                    # Update link in HTML file
                    local escaped_link=$(echo "$link" | sed 's/[&]/\\&/g; s/[[\.*^$(){}+?|\\]/\\&/g')
                    sed -i "s|href=[\"']${escaped_link}[\"']|href=\"$relative_path\"|g" "$file_path" 2>> "$crawl_log"
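                    # The link is escaped first so '&' and regex metacharacters in it cannot
                    # break the sed expression that rewrites the href to the local relative path.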

                    if ! grep -Fx "$normalized_link" "$visited_file" >/dev/null; then
                        echo "$normalized_link" >> "$queue_file"
                        log_verbose "Queued: $normalized_link"
                    fi
                fi
            done
        else
            log_warning "Failed to download $current_url"
        fi
    done
done

    log_success "Crawled $page_count pages"
}

echo "Generating list of HTML files..."
find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) > "$TEMP_DIR/file_list.txt"
# Local HTML processing function
process_local_html() {
    local html_dir="$SOURCE"
    log_info "Processing local HTML files from: $html_dir"

    # Find all HTML files recursively
    local html_files=()
    while IFS= read -r -d '' file; do
        html_files+=("$file")
    done < <(find "$html_dir" -type f \( -name "*.html" -o -name "*.htm" \) -print0)

    if [ ${#html_files[@]} -eq 0 ]; then
        log_error "No HTML files found in $html_dir"
        exit 1
    fi

    log_info "Found ${#html_files[@]} HTML files"

    # Copy HTML files to temp directory, maintaining structure
    local temp_html_dir="$TEMP_DIR/html"
    mkdir -p "$temp_html_dir"

    for html_file in "${html_files[@]}"; do
        local rel_path=$(realpath --relative-to="$html_dir" "$html_file")
        local dest_path="$temp_html_dir/$rel_path"
        local dest_dir=$(dirname "$dest_path")

        mkdir -p "$dest_dir"
        cp "$html_file" "$dest_path"
        log_verbose "Copied: $rel_path"
    done

    # Also copy any associated assets (CSS, JS, images)
    local asset_extensions=("css" "js" "png" "jpg" "jpeg" "gif" "svg" "ico" "woff" "woff2" "ttf" "eot")
    for ext in "${asset_extensions[@]}"; do
        while IFS= read -r -d '' file; do
            local rel_path=$(realpath --relative-to="$html_dir" "$file")
            local dest_path="$temp_html_dir/$rel_path"
            local dest_dir=$(dirname "$dest_path")

            mkdir -p "$dest_dir"
            cp "$file" "$dest_path"
            log_verbose "Copied asset: $rel_path"
        done < <(find "$html_dir" -type f -name "*.${ext}" -print0 2>/dev/null)
    done

    log_success "Processed local HTML files"
}

if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
    echo "Error: No HTML files found. Check crawl_errors.log for details."
    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
    if [ $KEEP_TEMP -eq 0 ]; then
        rm -rf "$TEMP_DIR"
# Try alternative conversion for problematic files
try_alternative_conversion() {
    local html_file="$1"
    local pdf_file="$2"
    local html_dir="$3"
    local weasyprint_log="$4"

    log_verbose "Trying alternative conversion for $(basename "$html_file")"

    # Try with simpler options (no base URL, different media type)
    if timeout 30 weasyprint \
        --media-type screen \
        --pdf-version 1.4 \
        "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
        return 0
    fi

    # Try with minimal options
    if timeout 30 weasyprint \
        "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
        return 0
    fi

    return 1
}
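# Returns 0 as soon as either fallback run exits successfully; callers still check that the
# resulting PDF exists and is non-empty before counting it as a success.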

# PDF generation function
generate_pdfs() {
    local html_files=()
    local pdf_files=()

    # Find HTML files in temp directory
    while IFS= read -r -d '' file; do
        html_files+=("$file")
    done < <(find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) -print0)

    if [ ${#html_files[@]} -eq 0 ]; then
        log_error "No HTML files found for PDF generation"
        exit 1
    fi

    # Sort files for consistent processing order
    IFS=$'\n' html_files=($(sort <<<"${html_files[*]}"))
    unset IFS

    log_info "Converting ${#html_files[@]} HTML files to PDF"

    local success_count=0
    local failed_count=0
    local current_file=0
    local weasyprint_log="$TEMP_DIR/logs/weasyprint_errors.log"

    for html_file in "${html_files[@]}"; do
        ((current_file++))
        local html_dir=$(dirname "$html_file")
        local pdf_file="${html_file%.html}.pdf"
        pdf_file="${pdf_file%.htm}.pdf"
        local filename=$(basename "$html_file")

        log_info "[$current_file/${#html_files[@]}] Converting $filename to PDF..."

        local start_time=$(date +%s)

        log_verbose "Running WeasyPrint on: $html_file"
        if timeout "$WEASYPRINT_TIMEOUT" weasyprint \
            --base-url "file://$html_dir/" \
            --media-type print \
            --pdf-version 1.7 \
            --verbose \
            "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then

            local end_time=$(date +%s)
            local duration=$((end_time - start_time))

            if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
                pdf_files+=("$pdf_file")
                ((success_count++))
                log_success "✓ $filename converted successfully (${duration}s)"
            else
                log_warning "✗ $filename: PDF file is empty or missing"
                ((failed_count++))
            fi
        else
            local end_time=$(date +%s)
            local duration=$((end_time - start_time))

            if [ $duration -ge $WEASYPRINT_TIMEOUT ]; then
                log_warning "✗ $filename: Conversion timed out after ${WEASYPRINT_TIMEOUT}s"
            else
                log_warning "✗ $filename: Conversion failed (${duration}s)"
            fi

            # Try alternative conversion
            if try_alternative_conversion "$html_file" "$pdf_file" "$html_dir" "$weasyprint_log"; then
                if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
                    pdf_files+=("$pdf_file")
                    ((success_count++))
                    log_success "✓ $filename converted with alternative method"
                else
                    ((failed_count++))
                fi
            else
                ((failed_count++))
            fi
        fi

        # Show progress
        if [ $((current_file % 10)) -eq 0 ] || [ $current_file -eq ${#html_files[@]} ]; then
            log_info "Progress: $current_file/${#html_files[@]} (Success: $success_count, Failed: $failed_count)"
        fi
    done

    if [ ${#pdf_files[@]} -eq 0 ]; then
        log_error "No PDFs were generated successfully"
        exit 1
    fi

    log_success "Generated $success_count PDFs successfully"
    if [ $failed_count -gt 0 ]; then
        log_warning "$failed_count files failed to convert"
    fi

    # Merge PDFs
    log_info "Merging ${#pdf_files[@]} PDFs into $OUTPUT_PDF..."
    if pdfunite "${pdf_files[@]}" "$OUTPUT_PDF" 2>> "$TEMP_DIR/logs/processing.log"; then
        log_success "Successfully created $OUTPUT_PDF"
        if command -v du &> /dev/null; then
            local file_size=$(du -h "$OUTPUT_PDF" | cut -f1)
            log_info "Final PDF size: $file_size"
        fi
    else
        echo "Temporary directory preserved: $TEMP_DIR"
        log_error "Failed to merge PDFs"
        exit 1
    fi
    exit 1
fi
}
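# generate_pdfs converts every HTML file found under $TEMP_DIR and then merges the results
# with pdfunite, in sorted filename order.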

echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."

pdf_files=()

while IFS= read -r html_file; do
    html_dir=$(dirname "$html_file")
# Copy debug files
copy_debug_files() {
    local debug_files=(
        "crawl_errors.log"
        "weasyprint_errors.log"
        "processing.log"
    )

    if [ ! -f "$html_file" ]; then
        echo "Warning: $html_file not found, skipping." >> "$WEASYPRINT_ERROR_LOG"
        continue
    for file in "${debug_files[@]}"; do
        if [ -f "$TEMP_DIR/logs/$file" ]; then
            cp "$TEMP_DIR/logs/$file" "./$file"
        fi
    done

    # Copy other useful files
    if [ -f "$TEMP_DIR/queue.txt" ]; then
        cp "$TEMP_DIR/queue.txt" "./queue.log"
    fi

    pdf_file="${html_file%.html}.pdf"
    pdf_file="${pdf_file%.htm}.pdf"

    echo "Converting $html_file to PDF..."
    weasyprint \
        --base-url "file://$html_dir/" \
        --media-type print \
        "$html_file" "$pdf_file" 2>> "$WEASYPRINT_ERROR_LOG"

    if [ $? -ne 0 ]; then
        echo "Warning: Failed to convert $html_file to PDF, see weasyprint_errors.log for details." >> "$WEASYPRINT_ERROR_LOG"
        continue
    if [ -f "$TEMP_DIR/visited.txt" ]; then
        cp "$TEMP_DIR/visited.txt" "./visited.log"
    fi

    pdf_files+=("$pdf_file")
done < "$TEMP_DIR/file_list.txt"
    log_info "Debug files saved to current directory"
}

if [ ${#pdf_files[@]} -eq 0 ]; then
    echo "Error: No PDFs were generated. Check weasyprint_errors.log for details."
    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
    if [ $KEEP_TEMP -eq 0 ]; then
        rm -rf "$TEMP_DIR"
# Main function
main() {
    log_info "$SCRIPT_NAME v$VERSION starting..."

    parse_arguments "$@"
    validate_inputs
    check_dependencies
    init_environment

    if [ "$MODE" = "web" ]; then
        crawl_website
    elif [ "$MODE" = "local" ]; then
        process_local_html
    fi

    generate_pdfs
    copy_debug_files
    cleanup_temp

    log_success "Process completed successfully!"
    log_info "Output: $OUTPUT_PDF"

    # Sort files for consistent processing order
    IFS=$'\n' html_files=($(sort <<<"${html_files[*]}"))
    unset IFS
    log_info "Converting ${#html_files[@]} HTML files to PDF"

    local success_count=0
    local failed_count=0
    local current_file=0
    local weasyprint_log="$TEMP_DIR/logs/weasyprint_errors.log"

    for html_file in "${html_files[@]}"; do
        ((current_file++))
        local html_dir=$(dirname "$html_file")
        local pdf_file="${html_file%.html}.pdf"
        pdf_file="${pdf_file%.htm}.pdf"
        local filename=$(basename "$html_file")

        log_info "[$current_file/${#html_files[@]}] Converting $filename to PDF..."

        # Add progress indicators and better error handling
        local start_time=$(date +%s)

        if timeout "$WEASYPRINT_TIMEOUT" weasyprint \
            --base-url "file://$html_dir/" \
            --media-type print \
            --pdf-version 1.7 \
            --verbose \
            "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then

            local end_time=$(date +%s)
            local duration=$((end_time - start_time))

            if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
                pdf_files+=("$pdf_file")
                ((success_count++))
                log_success "✓ $filename converted successfully (${duration}s)"
            else
                log_warning "✗ $filename: PDF file is empty or missing"
                ((failed_count++))
            fi
        else
            local end_time=$(date +%s)
            local duration=$((end_time - start_time))

            if [ $duration -ge $WEASYPRINT_TIMEOUT ]; then
                log_warning "✗ $filename: Conversion timed out after ${WEASYPRINT_TIMEOUT}s"
            else
                log_warning "✗ $filename: Conversion failed (${duration}s)"
            fi

            # Try alternative conversion method
            log_verbose "Attempting alternative conversion for $filename"
            if try_alternative_conversion "$html_file" "$pdf_file" "$html_dir" "$weasyprint_log"; then
                if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
                    pdf_files+=("$pdf_file")
                    ((success_count++))
                    log_success "✓ $filename converted with alternative method"
                else
                    ((failed_count++))
                fi
            else
                ((failed_count++))
            fi

            # Log the last few lines of the error for this file
            echo "=== Error for $filename ===" >> "$weasyprint_log"
            tail -n 10 "$weasyprint_log" >> "$weasyprint_log"
        fi

        # Show progress
        if [ $((current_file % 10)) -eq 0 ] || [ $current_file -eq ${#html_files[@]} ]; then
            log_info "Progress: $current_file/${#html_files[@]} files processed (Success: $success_count, Failed: $failed_count)"
        fi
    done

    if [ ${#pdf_files[@]} -eq 0 ]; then
        log_error "No PDFs were generated successfully"
        log_error "Check weasyprint_errors.log for details"
        exit 1
    fi

    log_success "Generated $success_count PDFs successfully"
    if [ $failed_count -gt 0 ]; then
        log_warning "$failed_count files failed to convert"
    fi

    # Merge PDFs
    log_info "Merging ${#pdf_files[@]} PDFs into $OUTPUT_PDF..."

    if pdfunite "${pdf_files[@]}" "$OUTPUT_PDF" 2>> "$TEMP_DIR/logs/processing.log"; then
        log_success "Successfully created $OUTPUT_PDF"

        # Show final file info
        if command -v du &> /dev/null; then
            local file_size=$(du -h "$OUTPUT_PDF" | cut -f1)
            log_info "Final PDF size: $file_size"
        fi
    else
        echo "Temporary directory preserved: $TEMP_DIR"
        log_error "Failed to merge PDFs"
        log_error "Check processing.log for details"
        exit 1
    fi
    exit 1
fi
}

echo "Generated ${#pdf_files[@]} PDFs."

echo "Merging PDFs into $OUTPUT_PDF..."
pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"

if [ $? -eq 0 ]; then
    echo "Successfully created $OUTPUT_PDF"
else
    echo "Error merging PDFs."
    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
    if [ $KEEP_TEMP -eq 0 ]; then
        rm -rf "$TEMP_DIR"
    else
        echo "Temporary directory preserved: $TEMP_DIR"
    fi
    exit 1
fi

cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
cp "$VISITED_FILE" "./visited.log" 2>/dev/null
cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
echo "Debug files saved: crawl_errors.log, weasyprint_errors.log, queue.log, visited.log, file_list.log"

if [ $KEEP_TEMP -eq 0 ]; then
    rm -rf "$TEMP_DIR"
    echo "Cleaned up temporary files."
else
    echo "Temporary directory preserved: $TEMP_DIR"
fi
# Run main function with all arguments
main "$@"
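# Example invocations (from the help text above):
#   ./crawl.sh web https://example.com/docs docs.pdf --verbose
#   ./crawl.sh local ./saved_website/ website.pdf --keep-temp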