From 2d60443be4efaa08d4a372ba57f16e576870bed6 Mon Sep 17 00:00:00 2001
From: Amoelle
Date: Mon, 2 Jun 2025 10:09:53 +0300
Subject: [PATCH] init

---
 .gitignore |   1 +
 README.md  |   1 +
 crawl.sh   | 219 ++++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 161 insertions(+), 60 deletions(-)
 create mode 100644 README.md

diff --git a/.gitignore b/.gitignore
index a136337..5eba6c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 *.pdf
+*.log
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7426817
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+Vibe coded shitty tool to crawl websites into pdf files.
diff --git a/crawl.sh b/crawl.sh
index 1ce6bc9..a646435 100755
--- a/crawl.sh
+++ b/crawl.sh
@@ -1,28 +1,20 @@
 #!/bin/bash
 
-# Usage: ./crawl_to_pdf.sh <start_url> <max_depth> <output_pdf>
-# Example: ./crawl_to_pdf.sh https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf
-
-# Check if required arguments are provided
-if [ $# -ne 3 ]; then
-    echo "Usage: $0 <start_url> <max_depth> <output_pdf>"
-    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf"
+if [ $# -lt 2 ]; then
+    echo "Usage: $0 <start_url> <output_pdf> [--keep-temp]"
+    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page wikipedia.pdf --keep-temp"
     exit 1
 fi
 
 START_URL="$1"
-MAX_DEPTH="$2"
-OUTPUT_PDF="$3"
-
-# Validate max_depth is a positive integer
-if ! [[ "$MAX_DEPTH" =~ ^[0-9]+$ ]] || [ "$MAX_DEPTH" -lt 1 ]; then
-    echo "Error: max_depth must be a positive integer."
-    exit 1
+OUTPUT_PDF="$2"
+KEEP_TEMP=0
+if [ "$3" = "--keep-temp" ]; then
+    KEEP_TEMP=1
 fi
 
-# Check if wget, weasyprint, and pdfunite are installed
-if ! command -v wget &> /dev/null; then
-    echo "Error: wget is not installed."
+if ! command -v curl &> /dev/null; then
+    echo "Error: curl is not installed."
     exit 1
 fi
 if ! command -v weasyprint &> /dev/null; then
@@ -34,86 +26,174 @@ if ! command -v pdfunite &> /dev/null; then
     exit 1
 fi
 
-# Create a temporary directory for downloads
 TEMP_DIR=$(mktemp -d)
 echo "Working in temporary directory: $TEMP_DIR"
 
-# Extract domain from URL to restrict crawling
-DOMAIN=$(echo "$START_URL" | grep -oP '(?<=://)[^/]+')
+DOMAIN=$(echo "$START_URL" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
+BASE_URL=$(echo "$START_URL" | grep -oE 'https?://[^/]+')
 
-# Crawl the website using wget
-echo "Crawling $START_URL (max depth: $MAX_DEPTH)..."
-wget \
-    --recursive \
-    --level="$MAX_DEPTH" \
-    --convert-links \
-    --html-extension \
-    --no-parent \
-    --domains="$DOMAIN" \
-    --no-verbose \
-    --directory-prefix="$TEMP_DIR" \
-    "$START_URL"
+QUEUE_FILE="$TEMP_DIR/queue.txt"
+VISITED_FILE="$TEMP_DIR/visited.txt"
+CRAWL_ERROR_LOG="$TEMP_DIR/crawl_errors.log"
+WEASYPRINT_ERROR_LOG="$TEMP_DIR/weasyprint_errors.log"
+echo "$START_URL" > "$QUEUE_FILE"
+touch "$VISITED_FILE" "$CRAWL_ERROR_LOG" "$WEASYPRINT_ERROR_LOG"
 
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to crawl $START_URL."
-    rm -rf "$TEMP_DIR"
-    exit 1
-fi
+MAX_DEPTH=2
+MAX_PAGES=100
+PAGE_COUNT=0
+
+mkdir -p "$TEMP_DIR/$DOMAIN"
+
+normalize_url() {
+    local url="$1"
+    local base="$2"
+    if [[ "$url" =~ ^/ ]]; then
+        url="$BASE_URL$url"
+    elif [[ ! "$url" =~ ^https?:// ]]; then
+        url=$(echo "$url" | sed 's|^\./||')
+        url="$BASE_URL/$(dirname "$base" | sed "s|^$BASE_URL||")/$url"
+    fi
+    url=$(echo "$url" | sed 's/#.*$//; s|/$||; s/\?.*$//')
+    echo "$url"
+}
+
+url_to_filepath() {
+    local url="$1"
+    local path=$(echo "$url" | sed "s|^$BASE_URL/||; s|/$||; s|/|_|g")
+    if [ -z "$path" ]; then
+        path="index"
+    fi
+    echo "$TEMP_DIR/$DOMAIN/$path.html"
+}
+
+calculate_depth() {
+    local url="$1"
+    local path=$(echo "$url" | sed "s|^https\?://[^/]*||; s|^/||; s|/$||")
+    if [ -z "$path" ]; then
+        echo 0
+    else
+        echo "$path" | awk -F'/' '{print NF}'
+    fi
+}
+
+echo "Crawling $START_URL (links within $DOMAIN, max depth $MAX_DEPTH, max pages $MAX_PAGES)..."
+while [ -s "$QUEUE_FILE" ] && [ "$PAGE_COUNT" -lt "$MAX_PAGES" ]; do
+    CURRENT_URL=$(head -n 1 "$QUEUE_FILE")
+    sed -i '1d' "$QUEUE_FILE"
+
+    if grep -Fx "$CURRENT_URL" "$VISITED_FILE" >/dev/null; then
+        echo "Skipping already visited: $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    echo "$CURRENT_URL" >> "$VISITED_FILE"
+
+    DEPTH=$(calculate_depth "$CURRENT_URL")
+    echo "Processing $CURRENT_URL (depth $DEPTH)" >> "$CRAWL_ERROR_LOG"
+    if [ "$DEPTH" -gt "$MAX_DEPTH" ]; then
+        echo "Skipping (depth $DEPTH > $MAX_DEPTH): $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    FILE_PATH=$(url_to_filepath "$CURRENT_URL")
+    echo "Downloading $CURRENT_URL to $FILE_PATH..."
+    curl -s -L --fail --retry 3 --retry-delay 2 "$CURRENT_URL" > "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to download $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    ((PAGE_COUNT++))
+    echo "Crawled page $PAGE_COUNT: $CURRENT_URL"
+
+    LINKS=$(grep -o '<a[^>]*href=["'"'"'][^"'"'"']*["'"'"']' "$FILE_PATH" | sed 's/.*href=["'"'"']\([^"'"'"']*\)["'"'"'].*/\1/' | sort -u)
+    for LINK in $LINKS; do
+        if [ -z "$LINK" ] || [[ "$LINK" =~ ^(javascript:|mailto:|#|data:) ]]; then
+            continue
+        fi
+
+        NORMALIZED_LINK=$(normalize_url "$LINK" "$CURRENT_URL")
+
+        if [[ "$NORMALIZED_LINK" =~ ^https?://$DOMAIN ]]; then
+            LOCAL_PATH=$(url_to_filepath "$NORMALIZED_LINK")
+            RELATIVE_PATH=$(realpath --relative-to="$(dirname "$FILE_PATH")" "$LOCAL_PATH" 2>/dev/null || echo "$LOCAL_PATH")
+
+            ESCAPED_LINK=$(echo "$LINK" | sed 's/[&]/\\&/g')
+            sed -i "s|href=[\"']${ESCAPED_LINK}[\"']|href=\"$RELATIVE_PATH\"|g" "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+
+            if ! grep -Fx "$NORMALIZED_LINK" "$VISITED_FILE" >/dev/null; then
+                echo "$NORMALIZED_LINK" >> "$QUEUE_FILE"
+                echo "Queued: $NORMALIZED_LINK" >> "$CRAWL_ERROR_LOG"
+            fi
+        fi
+    done
+done
 
-# Generate file_list.txt with all HTML files
 echo "Generating list of HTML files..."
-find "$TEMP_DIR" -type f -name "*.html" > "$TEMP_DIR/file_list.txt"
+find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) > "$TEMP_DIR/file_list.txt"
 
-# Check if file_list.txt was generated successfully
 if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
-    echo "Error: No HTML files found."
-    rm -rf "$TEMP_DIR"
+    echo "Error: No HTML files found. Check crawl_errors.log for details."
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
     exit 1
 fi
 
 echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."
 
-# Array to store generated PDF files
 pdf_files=()
 
-# Convert each HTML file to PDF
 while IFS= read -r html_file; do
-    # Get the directory containing the HTML file
     html_dir=$(dirname "$html_file")
 
-    # Check if the HTML file exists
     if [ ! -f "$html_file" ]; then
-        echo "Warning: $html_file not found, skipping."
+        echo "Warning: $html_file not found, skipping." >> "$WEASYPRINT_ERROR_LOG"
         continue
     fi
 
-    # Define the output PDF for each HTML file
     pdf_file="${html_file%.html}.pdf"
+    pdf_file="${pdf_file%.htm}.pdf"
 
-    # Convert HTML to PDF using weasyprint
     echo "Converting $html_file to PDF..."
     weasyprint \
         --base-url "file://$html_dir/" \
         --media-type print \
-        "$html_file" "$pdf_file"
+        "$html_file" "$pdf_file" 2>> "$WEASYPRINT_ERROR_LOG"
 
     if [ $? -ne 0 ]; then
-        echo "Warning: Failed to convert $html_file to PDF, skipping."
+        echo "Warning: Failed to convert $html_file to PDF, see weasyprint_errors.log for details." >> "$WEASYPRINT_ERROR_LOG"
         continue
     fi
 
-    # Add the PDF to the array
     pdf_files+=("$pdf_file")
 done < "$TEMP_DIR/file_list.txt"
 
-# Check if any PDFs were generated
 if [ ${#pdf_files[@]} -eq 0 ]; then
-    echo "Error: No PDFs were generated."
-    rm -rf "$TEMP_DIR"
+    echo "Error: No PDFs were generated. Check weasyprint_errors.log for details."
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
     exit 1
 fi
 
-# Merge the PDFs into a single file
+echo "Generated ${#pdf_files[@]} PDFs."
+
 echo "Merging PDFs into $OUTPUT_PDF..."
 pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"
 
@@ -121,10 +201,29 @@ if [ $? -eq 0 ]; then
     echo "Successfully created $OUTPUT_PDF"
 else
     echo "Error merging PDFs."
-    rm -rf "$TEMP_DIR"
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
     exit 1
 fi
 
-# Clean up temporary directory
-rm -rf "$TEMP_DIR"
-echo "Cleaned up temporary files."
+cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+echo "Debug files saved: crawl_errors.log, weasyprint_errors.log, queue.log, visited.log, file_list.log"
+
+if [ $KEEP_TEMP -eq 0 ]; then
+    rm -rf "$TEMP_DIR"
+    echo "Cleaned up temporary files."
+else
+    echo "Temporary directory preserved: $TEMP_DIR"
+fi