init
This commit is contained in:
130
crawl.sh
Executable file
130
crawl.sh
Executable file
@@ -0,0 +1,130 @@
|
|||||||
|
#!/bin/bash
#
# crawl.sh — recursively crawl a website and merge every fetched page into a
# single PDF.
#
# Usage:   ./crawl.sh <start_url> <max_depth> <output_pdf>
# Example: ./crawl.sh https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf
#
# Requires: wget, weasyprint, pdfunite (poppler-utils).

set -u

# Print an error message to stderr and exit non-zero.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# --- Argument handling -------------------------------------------------------
if [ $# -ne 3 ]; then
  echo "Usage: $0 <start_url> <max_depth> <output_pdf>" >&2
  echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf" >&2
  exit 1
fi

START_URL="$1"
MAX_DEPTH="$2"
OUTPUT_PDF="$3"

# max_depth must be a positive integer.
if ! [[ "$MAX_DEPTH" =~ ^[0-9]+$ ]] || [ "$MAX_DEPTH" -lt 1 ]; then
  die "max_depth must be a positive integer."
fi

# --- Dependency checks -------------------------------------------------------
for tool in wget weasyprint pdfunite; do
  command -v "$tool" >/dev/null 2>&1 || die "$tool is not installed."
done

# --- Workspace ---------------------------------------------------------------
TEMP_DIR=$(mktemp -d) || die "could not create a temporary directory."
# Clean up on every exit path (success, error, Ctrl-C) instead of repeating
# 'rm -rf' before each early exit.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
echo "Working in temporary directory: $TEMP_DIR"

# Extract the host part of the URL to restrict crawling to the same domain.
# Pure parameter expansion — avoids the GNU-only 'grep -oP'.
DOMAIN=${START_URL#*://}
DOMAIN=${DOMAIN%%/*}

# --- Crawl -------------------------------------------------------------------
echo "Crawling $START_URL (max depth: $MAX_DEPTH)..."
wget \
  --recursive \
  --level="$MAX_DEPTH" \
  --convert-links \
  --adjust-extension \
  --no-parent \
  --domains="$DOMAIN" \
  --no-verbose \
  --directory-prefix="$TEMP_DIR" \
  "$START_URL"
wget_status=$?
# wget exits 8 when *some* requests failed (e.g. a handful of 404 links),
# which is routine during a recursive crawl — only other non-zero codes are
# fatal.
if [ "$wget_status" -ne 0 ] && [ "$wget_status" -ne 8 ]; then
  die "Failed to crawl $START_URL (wget exit code $wget_status)."
fi

# --- Collect HTML files ------------------------------------------------------
echo "Generating list of HTML files..."
# Sort so the merged PDF has a deterministic page order across runs.
find "$TEMP_DIR" -type f -name "*.html" | sort > "$TEMP_DIR/file_list.txt"

if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
  die "No HTML files found."
fi

echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."

# --- Convert each HTML file to PDF -------------------------------------------
pdf_files=()

while IFS= read -r html_file; do
  if [ ! -f "$html_file" ]; then
    echo "Warning: $html_file not found, skipping." >&2
    continue
  fi

  # Base URL lets weasyprint resolve relative links (images, CSS) on disk.
  html_dir=$(dirname "$html_file")
  pdf_file="${html_file%.html}.pdf"

  echo "Converting $html_file to PDF..."
  if ! weasyprint \
      --base-url "file://$html_dir/" \
      --media-type print \
      "$html_file" "$pdf_file"; then
    echo "Warning: Failed to convert $html_file to PDF, skipping." >&2
    continue
  fi

  pdf_files+=("$pdf_file")
done < "$TEMP_DIR/file_list.txt"

if [ ${#pdf_files[@]} -eq 0 ]; then
  die "No PDFs were generated."
fi

# --- Merge -------------------------------------------------------------------
echo "Merging PDFs into $OUTPUT_PDF..."
if pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"; then
  echo "Successfully created $OUTPUT_PDF"
else
  die "merging PDFs failed."
fi

# The EXIT trap removes $TEMP_DIR.
echo "Cleaned up temporary files."
|
Reference in New Issue
Block a user