commit 69feebfd44ea54a9d938f1896d02f37d614f3380
Author: Amoelle
Date:   Wed May 7 22:47:14 2025 +0300

    init

diff --git a/crawl.sh b/crawl.sh
new file mode 100755
index 0000000..1ce6bc9
--- /dev/null
+++ b/crawl.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# Usage: ./crawl.sh <start_url> <max_depth> <output_pdf>
+# Example: ./crawl.sh https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf
+
+# Check if required arguments are provided
+if [ $# -ne 3 ]; then
+    echo "Usage: $0 <start_url> <max_depth> <output_pdf>"
+    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf"
+    exit 1
+fi
+
+START_URL="$1"
+MAX_DEPTH="$2"
+OUTPUT_PDF="$3"
+
+# Validate max_depth is a positive integer
+if ! [[ "$MAX_DEPTH" =~ ^[0-9]+$ ]] || [ "$MAX_DEPTH" -lt 1 ]; then
+    echo "Error: max_depth must be a positive integer."
+    exit 1
+fi
+
+# Check if wget, weasyprint, and pdfunite are installed
+if ! command -v wget &> /dev/null; then
+    echo "Error: wget is not installed."
+    exit 1
+fi
+if ! command -v weasyprint &> /dev/null; then
+    echo "Error: weasyprint is not installed."
+    exit 1
+fi
+if ! command -v pdfunite &> /dev/null; then
+    echo "Error: pdfunite is not installed."
+    exit 1
+fi
+
+# Create a temporary directory for downloads
+TEMP_DIR=$(mktemp -d)
+echo "Working in temporary directory: $TEMP_DIR"
+
+# Extract domain from URL to restrict crawling
+DOMAIN=$(echo "$START_URL" | grep -oP '(?<=://)[^/]+')
+
+# Crawl the website using wget
+echo "Crawling $START_URL (max depth: $MAX_DEPTH)..."
+wget \
+    --recursive \
+    --level="$MAX_DEPTH" \
+    --convert-links \
+    --html-extension \
+    --no-parent \
+    --domains="$DOMAIN" \
+    --no-verbose \
+    --directory-prefix="$TEMP_DIR" \
+    "$START_URL"
+
+if [ $? -ne 0 ]; then
+    echo "Error: Failed to crawl $START_URL."
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+# Generate file_list.txt with all HTML files
+echo "Generating list of HTML files..."
+find "$TEMP_DIR" -type f -name "*.html" > "$TEMP_DIR/file_list.txt"
+
+# Check if file_list.txt was generated successfully
+if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
+    echo "Error: No HTML files found."
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."
+
+# Array to store generated PDF files
+pdf_files=()
+
+# Convert each HTML file to PDF
+while IFS= read -r html_file; do
+    # Get the directory containing the HTML file
+    html_dir=$(dirname "$html_file")
+
+    # Check if the HTML file exists
+    if [ ! -f "$html_file" ]; then
+        echo "Warning: $html_file not found, skipping."
+        continue
+    fi
+
+    # Define the output PDF for each HTML file
+    pdf_file="${html_file%.html}.pdf"
+
+    # Convert HTML to PDF using weasyprint
+    echo "Converting $html_file to PDF..."
+    weasyprint \
+        --base-url "file://$html_dir/" \
+        --media-type print \
+        "$html_file" "$pdf_file"
+
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to convert $html_file to PDF, skipping."
+        continue
+    fi
+
+    # Add the PDF to the array
+    pdf_files+=("$pdf_file")
+done < "$TEMP_DIR/file_list.txt"
+
+# Check if any PDFs were generated
+if [ ${#pdf_files[@]} -eq 0 ]; then
+    echo "Error: No PDFs were generated."
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+# Merge the PDFs into a single file
+echo "Merging PDFs into $OUTPUT_PDF..."
+pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"
+
+if [ $? -eq 0 ]; then
+    echo "Successfully created $OUTPUT_PDF"
+else
+    echo "Error merging PDFs."
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+# Clean up temporary directory
+rm -rf "$TEMP_DIR"
+echo "Cleaned up temporary files."
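
For reference, a minimal invocation sketch of the script added in this commit. The crawl command reuses the URL, depth, and output name from the script's own usage comment; the dependency-install lines are an assumption for a Debian-based system (wget and pdfunite via apt, weasyprint via pip) and may differ on other platforms:

    # assumed setup on a Debian-based system; pdfunite ships with poppler-utils
    sudo apt-get install wget poppler-utils
    pip install weasyprint

    # crawl up to 2 levels deep from the start URL and merge all rendered pages into wikipedia.pdf
    ./crawl.sh https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf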