improve script: browser-based fetching, asset download, and PDF CSS injection

Adds a --browser mode that fetches pages through headless Chromium/Selenium
(with a curl fallback) to get past Cloudflare challenges, downloads external
CSS and images next to the crawled pages, injects print-friendly CSS before
conversion, and removes a duplicated conversion/merge block from main().
diff --git a/crawl.sh b/crawl.sh
--- a/crawl.sh
+++ b/crawl.sh
@@ -1,23 +1,24 @@
 #!/bin/bash

-# Enhanced Website to PDF Converter
-# Supports both web crawling and local HTML processing
+# Enhanced Website to PDF Converter with Browser Support
+# Supports both web crawling and local HTML processing with Cloudflare bypass

 # set -euo pipefail

 # Configuration
-readonly SCRIPT_NAME="$(basename "$0")"
-readonly VERSION="2.0"
-readonly MAX_DEPTH=2
-readonly MAX_PAGES=100
+SCRIPT_NAME="$(basename "$0")"
+readonly VERSION="2.1"
+MAX_DEPTH=2
+MAX_PAGES=100
 readonly WEASYPRINT_TIMEOUT=120
+readonly BROWSER_TIMEOUT=60

 # Color codes for output
-readonly RED='\033[0;31m'
-readonly GREEN='\033[0;32m'
-readonly YELLOW='\033[1;33m'
-readonly BLUE='\033[0;34m'
-readonly NC='\033[0m' # No Color
+readonly RED="\033[0;31m"
+readonly GREEN="\033[0;32m"
+readonly YELLOW="\033[1;33m"
+readonly BLUE="\033[0;34m"
+readonly NC="\033[0m" # No Color

 # Global variables
 TEMP_DIR=""
@@ -26,6 +27,7 @@ VERBOSE=0
 MODE=""
 SOURCE=""
 OUTPUT_PDF=""
+USE_BROWSER=0

 # Logging functions
 log_info() {
@@ -53,7 +55,7 @@ log_verbose() {
 # Help function
 show_help() {
     cat << EOF
-$SCRIPT_NAME v$VERSION - Enhanced Website to PDF Converter
+$SCRIPT_NAME v$VERSION - Enhanced Website to PDF Converter with Browser Support

 USAGE:
     $SCRIPT_NAME web <start_url> <output_pdf> [options]
@@ -73,15 +75,17 @@ OPTIONS:
     --verbose        Enable verbose output
     --max-depth N    Maximum crawling depth (default: $MAX_DEPTH)
     --max-pages N    Maximum pages to crawl (default: $MAX_PAGES)
+    --browser        Use browser automation to bypass Cloudflare protection
     --help           Show this help message

 EXAMPLES:
     $SCRIPT_NAME web https://example.com/docs docs.pdf --verbose
     $SCRIPT_NAME local ./saved_website/ website.pdf --keep-temp
-    $SCRIPT_NAME web https://en.wikipedia.org/wiki/Main_Page wiki.pdf --max-depth 1
+    $SCRIPT_NAME web https://blog.exploit.org/caster-legless/ article.pdf --browser --verbose

 DEPENDENCIES:
     curl, weasyprint, pdfunite (poppler-utils)
+    For --browser mode: python3, selenium, chromium-browser

 EOF
 }
@@ -115,22 +119,36 @@ trap cleanup_on_signal INT TERM
|
|||||||
# Dependency check
|
# Dependency check
|
||||||
check_dependencies() {
|
check_dependencies() {
|
||||||
local missing_deps=()
|
local missing_deps=()
|
||||||
|
|
||||||
if [ "$MODE" = "web" ] && ! command -v curl &> /dev/null; then
|
if [ "$MODE" = "web" ] && ! command -v curl &> /dev/null; then
|
||||||
missing_deps+=("curl")
|
missing_deps+=("curl")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! command -v weasyprint &> /dev/null; then
|
if ! command -v weasyprint &> /dev/null; then
|
||||||
missing_deps+=("weasyprint")
|
missing_deps+=("weasyprint")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! command -v pdfunite &> /dev/null; then
|
if ! command -v pdfunite &> /dev/null; then
|
||||||
missing_deps+=("pdfunite (poppler-utils)")
|
missing_deps+=("pdfunite (poppler-utils)")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "$USE_BROWSER" -eq 1 ]; then
|
||||||
|
if ! command -v python3 &> /dev/null; then
|
||||||
|
missing_deps+=("python3")
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if selenium is available
|
||||||
|
if ! python3 -c "import selenium" 2>/dev/null; then
|
||||||
|
missing_deps+=("python3-selenium")
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if [ ${#missing_deps[@]} -gt 0 ]; then
|
if [ ${#missing_deps[@]} -gt 0 ]; then
|
||||||
log_error "Missing dependencies: ${missing_deps[*]}"
|
log_error "Missing dependencies: ${missing_deps[*]}"
|
||||||
log_error "Please install the required packages and try again."
|
log_error "Please install the required packages and try again."
|
||||||
|
if [ "$USE_BROWSER" -eq 1 ]; then
|
||||||
|
log_error "For browser mode, install: pip3 install selenium"
|
||||||
|
fi
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
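For reference: a minimal sketch of satisfying the --browser dependencies checked above, assuming a Debian-style system (the package names are assumptions, not taken from the commit):

    sudo apt-get install python3 chromium-browser   # assumed package names; adjust per distro
    pip3 install selenium                           # Python bindings used by the generated browser_fetch.py
    # Selenium 4.6+ can resolve a matching driver itself; older versions also
    # need a chromedriver that matches the installed Chromium.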
@@ -141,18 +159,18 @@ parse_arguments() {
         show_help
         exit 1
     fi

     MODE="$1"
     SOURCE="$2"
     OUTPUT_PDF="$3"
     shift 3

     # Validate mode
     if [ "$MODE" != "web" ] && [ "$MODE" != "local" ]; then
         log_error "Invalid mode: $MODE. Use 'web' or 'local'"
         exit 1
     fi

     # Parse options
     while [ $# -gt 0 ]; do
         case "$1" in
@@ -162,6 +180,9 @@ parse_arguments() {
         --verbose)
             VERBOSE=1
             ;;
+        --browser)
+            USE_BROWSER=1
+            ;;
         --max-depth)
             if [ $# -lt 2 ] || ! [[ "$2" =~ ^[0-9]+$ ]]; then
                 log_error "--max-depth requires a numeric argument"
@@ -205,7 +226,7 @@ validate_inputs() {
         fi
         SOURCE="$(realpath "$SOURCE")"
     fi

     # Check if output directory exists
     local output_dir="$(dirname "$OUTPUT_PDF")"
     if [ ! -d "$output_dir" ]; then
@@ -218,12 +239,100 @@ validate_inputs() {
|
|||||||
init_environment() {
|
init_environment() {
|
||||||
TEMP_DIR=$(mktemp -d)
|
TEMP_DIR=$(mktemp -d)
|
||||||
log_verbose "Created temporary directory: $TEMP_DIR"
|
log_verbose "Created temporary directory: $TEMP_DIR"
|
||||||
|
|
||||||
# Create log files
|
# Create log files
|
||||||
mkdir -p "$TEMP_DIR/logs"
|
mkdir -p "$TEMP_DIR/logs"
|
||||||
touch "$TEMP_DIR/logs/crawl_errors.log"
|
touch "$TEMP_DIR/logs/crawl_errors.log"
|
||||||
touch "$TEMP_DIR/logs/weasyprint_errors.log"
|
touch "$TEMP_DIR/logs/weasyprint_errors.log"
|
||||||
touch "$TEMP_DIR/logs/processing.log"
|
touch "$TEMP_DIR/logs/processing.log"
|
||||||
|
touch "$TEMP_DIR/logs/browser_errors.log"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Browser-based page fetcher
|
||||||
|
fetch_page_with_browser() {
|
||||||
|
local url="$1"
|
||||||
|
local output_file="$2"
|
||||||
|
|
||||||
|
log_verbose "Fetching $url with browser automation"
|
||||||
|
|
||||||
|
# Create Python script for browser automation
|
||||||
|
cat > "$TEMP_DIR/browser_fetch.py" << EOF
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.common.exceptions import TimeoutException, WebDriverException
|
||||||
|
|
||||||
|
def fetch_page(url, output_file, timeout=60):
|
||||||
|
chrome_options = Options()
|
||||||
|
chrome_options.add_argument(\'--headless\')
|
||||||
|
chrome_options.add_argument(\'--no-sandbox\')
|
||||||
|
chrome_options.add_argument(\'--disable-dev-shm-usage\')
|
||||||
|
chrome_options.add_argument(\'--disable-gpu\')
|
||||||
|
chrome_options.add_argument(\'--window-size=1920,1080\')
|
||||||
|
chrome_options.add_argument(\'--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\')
|
||||||
|
|
||||||
|
driver = None
|
||||||
|
try:
|
||||||
|
driver = webdriver.Chrome(options=chrome_options)
|
||||||
|
driver.set_page_load_timeout(timeout)
|
||||||
|
|
||||||
|
print(f"Loading page: {url}", file=sys.stderr)
|
||||||
|
driver.get(url)
|
||||||
|
|
||||||
|
# Wait for page to load and bypass Cloudflare if needed
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
# Check if we\'re still on a Cloudflare challenge page
|
||||||
|
if "Just a moment" in driver.title or "cf-browser-verification" in driver.page_source:
|
||||||
|
print("Waiting for Cloudflare challenge to complete...", file=sys.stderr)
|
||||||
|
WebDriverWait(driver, timeout).until(
|
||||||
|
lambda d: "Just a moment" not in d.title and "cf-browser-verification" not in d.page_source
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the final page source
|
||||||
|
html_content = driver.page_source
|
||||||
|
|
||||||
|
with open(output_file, \'w\', encoding=\'utf-8\') as f:
|
||||||
|
f.write(html_content)
|
||||||
|
|
||||||
|
print(f"Successfully saved page to {output_file}", file=sys.stderr)
|
||||||
|
return True
|
||||||
|
|
||||||
|
except TimeoutException:
|
||||||
|
print(f"Timeout while loading {url}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
except WebDriverException as e:
|
||||||
|
print(f"WebDriver error: {e}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Unexpected error: {e}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
finally:
|
||||||
|
if driver:
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
if len(sys.argv) != 3:
|
||||||
|
print("Usage: python3 browser_fetch.py <url> <output_file>", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
url = sys.argv[1]
|
||||||
|
output_file = sys.argv[2]
|
||||||
|
|
||||||
|
success = fetch_page(url, output_file)
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Run the browser script
|
||||||
|
if python3 "$TEMP_DIR/browser_fetch.py" "$url" "$output_file" 2>> "$TEMP_DIR/logs/browser_errors.log"; then
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# URL normalization functions (for web mode)
|
# URL normalization functions (for web mode)
|
||||||
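For reference: a minimal sketch of exercising the generated helper by hand (the URL and output path are placeholders; $TEMP_DIR is whatever mktemp created):

    python3 "$TEMP_DIR/browser_fetch.py" "https://example.com/" /tmp/page.html
    echo $?   # 0 on success, 1 on timeout or WebDriver error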
@@ -231,15 +340,16 @@ normalize_url() {
     local url="$1"
     local base_url="$2"
     local current_url="$3"

     if [[ "$url" =~ ^/ ]]; then
         url="$base_url$url"
     elif [[ ! "$url" =~ ^https?:// ]]; then
+        # Corrected sed command: use a different delimiter for the s command
         url=$(echo "$url" | sed 's|^\./||')
         local current_dir="$(dirname "$current_url" | sed "s|^$base_url||")"
         url="$base_url$current_dir/$url"
     fi

     # Clean up URL
     url=$(echo "$url" | sed 's/#.*$//; s|/$||; s/\?.*$//')
     echo "$url"
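For reference, a rough trace of normalize_url on made-up inputs:

    # normalize_url "./setup.html" "https://example.com" "https://example.com/docs/index.html"
    #   "./" stripped, current directory "/docs" prepended -> https://example.com/docs/setup.html
    # normalize_url "/about?ref=nav#top" "https://example.com" "https://example.com/docs/index.html"
    #   joined to the base, then fragment, trailing slash and query string removed -> https://example.com/about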
@@ -249,7 +359,7 @@ url_to_filepath() {
     local url="$1"
     local base_url="$2"
     local domain="$3"

     local path=$(echo "$url" | sed "s|^$base_url/||; s|/$||; s|/|_|g")
     if [ -z "$path" ]; then
         path="index"
@@ -267,81 +377,271 @@ calculate_depth() {
     fi
 }

+# Function to download and process external resources
+download_external_resources() {
+    local html_file="$1"
+    local base_url="$2"
+    local html_dir="$(dirname "$html_file")"
+
+    log_verbose "Processing external resources for $(basename "$html_file")"
+
+    # Create assets directory
+    mkdir -p "$html_dir/assets"
+
+    # Download CSS files
+    grep -o 'href=["\047][^"\047]*\.css[^"\047]*["\047]' "$html_file" 2>/dev/null | sed 's/.*href=["\047]\([^"\047]*\)["\047].*/\1/' | while read -r css_url; do
+        if [[ "$css_url" =~ ^https?:// ]]; then
+            local css_filename=$(basename "$css_url" | sed 's/\?.*$//')
+            local css_path="$html_dir/assets/$css_filename"
+            if curl -s -L --fail --max-time 10 "$css_url" > "$css_path" 2>/dev/null; then
+                log_verbose "Downloaded CSS: $css_url"
+                # Update HTML to use local CSS
+                sed -i "s|href=[\"']${css_url}[\"']|href=\"assets/$css_filename\"|g" "$html_file"
+            fi
+        elif [[ "$css_url" =~ ^/ ]]; then
+            local full_css_url="$base_url$css_url"
+            local css_filename=$(basename "$css_url" | sed 's/\?.*$//')
+            local css_path="$html_dir/assets/$css_filename"
+            if curl -s -L --fail --max-time 10 "$full_css_url" > "$css_path" 2>/dev/null; then
+                log_verbose "Downloaded CSS: $full_css_url"
+                sed -i "s|href=[\"']${css_url}[\"']|href=\"assets/$css_filename\"|g" "$html_file"
+            fi
+        fi
+    done
+
+    # Download images
+    grep -o 'src=["\047][^"\047]*\.\(png\|jpg\|jpeg\|gif\|svg\|webp\)[^"\047]*["\047]' "$html_file" 2>/dev/null | sed 's/.*src=["\047]\([^"\047]*\)["\047].*/\1/' | while read -r img_url; do
+        if [[ "$img_url" =~ ^https?:// ]]; then
+            local img_filename=$(basename "$img_url" | sed 's/\?.*$//')
+            local img_path="$html_dir/assets/$img_filename"
+            if curl -s -L --fail --max-time 10 "$img_url" > "$img_path" 2>/dev/null; then
+                log_verbose "Downloaded image: $img_url"
+                sed -i "s|src=[\"']${img_url}[\"']|src=\"assets/$img_filename\"|g" "$html_file"
+            fi
+        elif [[ "$img_url" =~ ^/ ]]; then
+            local full_img_url="$base_url$img_url"
+            local img_filename=$(basename "$img_url" | sed 's/\?.*$//')
+            local img_path="$html_dir/assets/$img_filename"
+            if curl -s -L --fail --max-time 10 "$full_img_url" > "$img_path" 2>/dev/null; then
+                log_verbose "Downloaded image: $full_img_url"
+                sed -i "s|src=[\"']${img_url}[\"']|src=\"assets/$img_filename\"|g" "$html_file"
+            fi
+        fi
+    done
+}
+
+# Function to inject CSS for better PDF rendering
+inject_pdf_css() {
+    local html_file="$1"
+
+    # Create CSS for better PDF rendering
+    # Using a temporary file for the CSS content to avoid complex escaping issues with newlines in sed
+    local pdf_css_file="$TEMP_DIR/pdf_styles.css"
+    cat > "$pdf_css_file" << 'PDF_CSS_EOF'
+<style>
+/* PDF-specific styles */
+@page {
+    size: A4;
+    margin: 2cm;
+}
+
+body {
+    font-family: Arial, sans-serif;
+    line-height: 1.6;
+    color: #333;
+    max-width: none;
+}
+
+/* Image scaling */
+img {
+    max-width: 100% !important;
+    height: auto !important;
+    display: block;
+    margin: 10px 0;
+    page-break-inside: avoid;
+}
+
+/* Table improvements */
+table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 10px 0;
+    page-break-inside: avoid;
+}
+
+table, th, td {
+    border: 1px solid #ddd;
+}
+
+th, td {
+    padding: 8px;
+    text-align: left;
+}
+
+/* Code blocks */
+pre, code {
+    background-color: #f4f4f4;
+    padding: 10px;
+    border-radius: 4px;
+    font-family: monospace;
+    font-size: 0.9em;
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+}
+
+/* Headings */
+h1, h2, h3, h4, h5, h6 {
+    page-break-after: avoid;
+    margin-top: 20px;
+    margin-bottom: 10px;
+}
+
+/* Lists */
+ul, ol {
+    margin: 10px 0;
+    padding-left: 30px;
+}
+
+/* Links */
+a {
+    color: #0066cc;
+    text-decoration: none;
+}
+
+/* Prevent widows and orphans */
+p {
+    orphans: 3;
+    widows: 3;
+}
+
+/* Hide navigation and other non-content elements */
+nav, .nav, .navigation, .menu, .sidebar, .footer, .header, .ads, .advertisement {
+    display: none !important;
+}
+
+/* Ensure content flows properly */
+.content, .main, .article, .post {
+    width: 100% !important;
+    max-width: none !important;
+    margin: 0 !important;
+    padding: 0 !important;
+}
+</style>
+PDF_CSS_EOF
+
+    # Insert the CSS inside <head> if the file has one, otherwise after <body>
+    # (sed's r command reads the CSS file in after the matched line)
+    if grep -q "</head>" "$html_file"; then
+        sed -i "/<head[ >]/r $pdf_css_file" "$html_file"
+    else
+        sed -i "/<body[ >]/r $pdf_css_file" "$html_file"
+    fi
+}
+
 # Web crawling function
 crawl_website() {
     local start_url="$SOURCE"
-    local domain=$(echo "$start_url" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
+    local domain=$(echo "$start_url" | grep -oE "(https?://[^/]+)" | sed 's|https\?://||')
     local base_url=$(echo "$start_url" | grep -oE 'https?://[^/]+')

     log_info "Starting web crawl from: $start_url"
     log_info "Domain: $domain, Max depth: $MAX_DEPTH, Max pages: $MAX_PAGES"
+    if [ "$USE_BROWSER" -eq 1 ]; then
+        log_info "Using browser automation to bypass Cloudflare protection"
+    fi

     local queue_file="$TEMP_DIR/queue.txt"
     local visited_file="$TEMP_DIR/visited.txt"
     local crawl_log="$TEMP_DIR/logs/crawl_errors.log"

     echo "$start_url" > "$queue_file"
     touch "$visited_file"

     mkdir -p "$TEMP_DIR/$domain"

     local page_count=0

     while [ -s "$queue_file" ] && [ "$page_count" -lt "$MAX_PAGES" ]; do
         local current_url=$(head -n 1 "$queue_file")
         sed -i '1d' "$queue_file"

         if grep -Fx "$current_url" "$visited_file" >/dev/null; then
             log_verbose "Skipping already visited: $current_url"
             continue
         fi

         echo "$current_url" >> "$visited_file"

         local depth=$(calculate_depth "$current_url")
         log_verbose "Processing $current_url (depth $depth)"

         if [ "$depth" -gt "$MAX_DEPTH" ]; then
             log_verbose "Skipping (depth $depth > $MAX_DEPTH): $current_url"
             continue
         fi

         local file_path=$(url_to_filepath "$current_url" "$base_url" "$domain")
         log_verbose "Downloading $current_url to $file_path"

-        if curl -s -L --fail --retry 3 --retry-delay 2 --max-time 30 "$current_url" > "$file_path" 2>> "$crawl_log"; then
+        local download_success=0
+
+        # Try browser method first if enabled
+        if [ "$USE_BROWSER" -eq 1 ]; then
+            if fetch_page_with_browser "$current_url" "$file_path"; then
+                download_success=1
+                log_verbose "Successfully downloaded with browser: $current_url"
+            else
+                log_warning "Browser download failed for $current_url, trying curl"
+            fi
+        fi
+
+        # Fallback to curl if browser failed or not enabled
+        if [ "$download_success" -eq 0 ]; then
+            if curl -s -L --fail --retry 3 --retry-delay 2 --max-time 30 "$current_url" > "$file_path" 2>> "$crawl_log"; then
+                download_success=1
+                log_verbose "Successfully downloaded with curl: $current_url"
+            fi
+        fi
+
+        if [ "$download_success" -eq 1 ]; then
             ((page_count++))
             log_info "Crawled page $page_count: $current_url"

-            # Extract and process links
-            local links=$(grep -o '<a[^>]*href=["\047][^"\047]*["\047]' "$file_path" 2>/dev/null | sed 's/.*href=["\047]\([^"\047]*\)["\047].*/\1/' | sort -u)
-
-            for link in $links; do
-                if [ -z "$link" ] || [[ "$link" =~ ^(javascript:|mailto:|#|data:) ]]; then
-                    continue
-                fi
-
-                local normalized_link=$(normalize_url "$link" "$base_url" "$current_url")
-
-                if [[ "$normalized_link" =~ ^https?://$domain ]]; then
-                    local local_path=$(url_to_filepath "$normalized_link" "$base_url" "$domain")
-                    local relative_path=$(realpath --relative-to="$(dirname "$file_path")" "$local_path" 2>/dev/null || echo "$local_path")
-
-                    # Update link in HTML file
-                    local escaped_link=$(echo "$link" | sed 's/[&]/\\&/g; s/[[\.*^$(){}+?|\\]/\\&/g')
-                    sed -i "s|href=[\"']${escaped_link}[\"']|href=\"$relative_path\"|g" "$file_path" 2>> "$crawl_log"
-
-                    if ! grep -Fx "$normalized_link" "$visited_file" >/dev/null; then
-                        echo "$normalized_link" >> "$queue_file"
-                        log_verbose "Queued: $normalized_link"
-                    fi
-                fi
-            done
+            # Download external resources and inject PDF CSS
+            download_external_resources "$file_path" "$base_url"
+            inject_pdf_css "$file_path"
+
+            # Extract and process links (only if not at max depth)
+            if [ "$depth" -lt "$MAX_DEPTH" ]; then
+                local links=$(grep -o '<a[^>]*href=["\047][^"\047]*["\047]' "$file_path" 2>/dev/null | sed 's/.*href=["\047]\([^"\047]*\)["\047].*/\1/' | sort -u)
+
+                for link in $links; do
+                    if [ -z "$link" ] || [[ "$link" =~ ^(javascript:|mailto:|#|data:) ]]; then
+                        continue
+                    fi
+
+                    local normalized_link=$(normalize_url "$link" "$base_url" "$current_url")
+
+                    if [[ "$normalized_link" =~ ^https?://$domain ]]; then
+                        local local_path=$(url_to_filepath "$normalized_link" "$base_url" "$domain")
+                        local relative_path=$(realpath --relative-to="$(dirname "$file_path")" "$local_path" 2>/dev/null || echo "$local_path")
+
+                        # Update link in HTML file
+                        # Using a different delimiter for sed to avoid issues with '/' in URLs
+                        local escaped_link=$(echo "$link" | sed 's/[&/\.]/\\&/g') # Escape &, /, and .
+                        sed -i "s@href=[\"']${escaped_link}[\"']@href=\"$relative_path\"@g" "$file_path" 2>> "$crawl_log"
+
+                        if ! grep -Fx "$normalized_link" "$visited_file" >/dev/null; then
+                            echo "$normalized_link" >> "$queue_file"
+                            log_verbose "Queued: $normalized_link"
+                        fi
+                    fi
+                done
+            fi
         else
             log_warning "Failed to download $current_url"
         fi
     done

     log_success "Crawled $page_count pages"
 }

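For reference, a rough trace of how crawled pages land on disk: url_to_filepath flattens the URL path by stripping the base URL and turning "/" into "_", and matching in-page links are rewritten to the relative path of that saved file so the local copies reference each other (URL below is a placeholder):

    # "https://example.com/docs/setup/" with base "https://example.com"
    #   -> "docs/setup/" -> trailing slash dropped -> "docs_setup"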
@@ -349,34 +649,38 @@ crawl_website() {
 process_local_html() {
     local html_dir="$SOURCE"
     log_info "Processing local HTML files from: $html_dir"

     # Find all HTML files recursively
     local html_files=()
     while IFS= read -r -d '' file; do
         html_files+=("$file")
     done < <(find "$html_dir" -type f \( -name "*.html" -o -name "*.htm" \) -print0)

     if [ ${#html_files[@]} -eq 0 ]; then
         log_error "No HTML files found in $html_dir"
         exit 1
     fi

     log_info "Found ${#html_files[@]} HTML files"

     # Copy HTML files to temp directory, maintaining structure
     local temp_html_dir="$TEMP_DIR/html"
     mkdir -p "$temp_html_dir"

     for html_file in "${html_files[@]}"; do
         local rel_path=$(realpath --relative-to="$html_dir" "$html_file")
         local dest_path="$temp_html_dir/$rel_path"
         local dest_dir=$(dirname "$dest_path")

         mkdir -p "$dest_dir"
         cp "$html_file" "$dest_path"

+        # Inject PDF CSS for better rendering
+        inject_pdf_css "$dest_path"
+
         log_verbose "Copied: $rel_path"
     done

     # Also copy any associated assets (CSS, JS, images)
     local asset_extensions=("css" "js" "png" "jpg" "jpeg" "gif" "svg" "ico" "woff" "woff2" "ttf" "eot")
     for ext in "${asset_extensions[@]}"; do
@@ -384,13 +688,13 @@ process_local_html() {
             local rel_path=$(realpath --relative-to="$html_dir" "$file")
             local dest_path="$temp_html_dir/$rel_path"
             local dest_dir=$(dirname "$dest_path")

             mkdir -p "$dest_dir"
             cp "$file" "$dest_path"
             log_verbose "Copied asset: $rel_path"
         done < <(find "$html_dir" -type f -name "*.${ext}" -print0 2>/dev/null)
     done

     log_success "Processed local HTML files"
 }

@@ -400,9 +704,9 @@ try_alternative_conversion() {
     local pdf_file="$2"
     local html_dir="$3"
     local weasyprint_log="$4"

     log_verbose "Trying alternative conversion for $(basename "$html_file")"

     # Try with simpler options (no base URL, different media type)
     if timeout 30 weasyprint \
         --media-type screen \
@@ -410,13 +714,13 @@ try_alternative_conversion() {
         "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
         return 0
     fi

     # Try with minimal options
     if timeout 30 weasyprint \
         "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
         return 0
     fi

     return 1
 }

@@ -458,9 +762,12 @@ generate_pdfs() {
         local start_time=$(date +%s)

         log_verbose "Running WeasyPrint on: $html_file"
+
+        # Use optimized WeasyPrint settings for better CSS and image handling
         if timeout "$WEASYPRINT_TIMEOUT" weasyprint \
             --base-url "file://$html_dir/" \
-            --media-type print \
+            --media-type screen \
+            --presentational-hints \
             --pdf-version 1.7 \
             --verbose \
             "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
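For reference: a minimal sketch of the same conversion run by hand on a single file (paths are placeholders; the flags mirror the updated invocation above):

    timeout 120 weasyprint \
        --base-url "file:///tmp/site/" \
        --media-type screen \
        --presentational-hints \
        --pdf-version 1.7 \
        --verbose \
        /tmp/site/index.html /tmp/site/index.pdf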
@@ -536,151 +843,50 @@ copy_debug_files() {
         "crawl_errors.log"
         "weasyprint_errors.log"
         "processing.log"
+        "browser_errors.log"
     )

     for file in "${debug_files[@]}"; do
         if [ -f "$TEMP_DIR/logs/$file" ]; then
             cp "$TEMP_DIR/logs/$file" "./$file"
         fi
     done

     # Copy other useful files
     if [ -f "$TEMP_DIR/queue.txt" ]; then
         cp "$TEMP_DIR/queue.txt" "./queue.log"
     fi

     if [ -f "$TEMP_DIR/visited.txt" ]; then
         cp "$TEMP_DIR/visited.txt" "./visited.log"
     fi

     log_info "Debug files saved to current directory"
 }

 # Main function
 main() {
     log_info "$SCRIPT_NAME v$VERSION starting..."

     parse_arguments "$@"
     validate_inputs
     check_dependencies
     init_environment

     if [ "$MODE" = "web" ]; then
         crawl_website
     elif [ "$MODE" = "local" ]; then
         process_local_html
     fi

     generate_pdfs
     copy_debug_files
     cleanup_temp

     log_success "Process completed successfully!"
     log_info "Output: $OUTPUT_PDF"

-    # Sort files for consistent processing order
-    IFS=$'\n' html_files=($(sort <<<"${html_files[*]}"))
-    unset IFS
-    log_info "Converting ${#html_files[@]} HTML files to PDF"
-
-    local success_count=0
-    local failed_count=0
-    local current_file=0
-    local weasyprint_log="$TEMP_DIR/logs/weasyprint_errors.log"
-
-    for html_file in "${html_files[@]}"; do
-        ((current_file++))
-        local html_dir=$(dirname "$html_file")
-        local pdf_file="${html_file%.html}.pdf"
-        pdf_file="${pdf_file%.htm}.pdf"
-        local filename=$(basename "$html_file")
-
-        log_info "[$current_file/${#html_files[@]}] Converting $filename to PDF..."
-
-        # Add progress indicators and better error handling
-        local start_time=$(date +%s)
-
-        if timeout "$WEASYPRINT_TIMEOUT" weasyprint \
-            --base-url "file://$html_dir/" \
-            --media-type print \
-            --pdf-version 1.7 \
-            --verbose \
-            "$html_file" "$pdf_file" 2>> "$weasyprint_log"; then
-
-            local end_time=$(date +%s)
-            local duration=$((end_time - start_time))
-
-            if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
-                pdf_files+=("$pdf_file")
-                ((success_count++))
-                log_success "✓ $filename converted successfully (${duration}s)"
-            else
-                log_warning "✗ $filename: PDF file is empty or missing"
-                ((failed_count++))
-            fi
-        else
-            local end_time=$(date +%s)
-            local duration=$((end_time - start_time))
-
-            if [ $duration -ge $WEASYPRINT_TIMEOUT ]; then
-                log_warning "✗ $filename: Conversion timed out after ${WEASYPRINT_TIMEOUT}s"
-            else
-                log_warning "✗ $filename: Conversion failed (${duration}s)"
-            fi
-
-            # Try alternative conversion method
-            log_verbose "Attempting alternative conversion for $filename"
-            if try_alternative_conversion "$html_file" "$pdf_file" "$html_dir" "$weasyprint_log"; then
-                if [ -f "$pdf_file" ] && [ -s "$pdf_file" ]; then
-                    pdf_files+=("$pdf_file")
-                    ((success_count++))
-                    log_success "✓ $filename converted with alternative method"
-                else
-                    ((failed_count++))
-                fi
-            else
-                ((failed_count++))
-            fi
-
-            # Log the last few lines of the error for this file
-            echo "=== Error for $filename ===" >> "$weasyprint_log"
-            tail -n 10 "$weasyprint_log" >> "$weasyprint_log"
-        fi
-
-        # Show progress
-        if [ $((current_file % 10)) -eq 0 ] || [ $current_file -eq ${#html_files[@]} ]; then
-            log_info "Progress: $current_file/${#html_files[@]} files processed (Success: $success_count, Failed: $failed_count)"
-        fi
-    done
-
-    if [ ${#pdf_files[@]} -eq 0 ]; then
-        log_error "No PDFs were generated successfully"
-        log_error "Check weasyprint_errors.log for details"
-        exit 1
-    fi
-
-    log_success "Generated $success_count PDFs successfully"
-    if [ $failed_count -gt 0 ]; then
-        log_warning "$failed_count files failed to convert"
-    fi
-
-    # Merge PDFs
-    log_info "Merging ${#pdf_files[@]} PDFs into $OUTPUT_PDF..."
-
-    if pdfunite "${pdf_files[@]}" "$OUTPUT_PDF" 2>> "$TEMP_DIR/logs/processing.log"; then
-        log_success "Successfully created $OUTPUT_PDF"
-
-        # Show final file info
-        if command -v du &> /dev/null; then
-            local file_size=$(du -h "$OUTPUT_PDF" | cut -f1)
-            log_info "Final PDF size: $file_size"
-        fi
-    else
-        log_error "Failed to merge PDFs"
-        log_error "Check processing.log for details"
-        exit 1
-    fi
 }

 # Run main function with all arguments
 main "$@"