init

.gitignore (vendored)
@@ -1 +1,2 @@
 *.pdf
+*.log

README.md (new file)
@@ -0,0 +1 @@
+Vibe coded shitty tool to crawl websites into pdf files.
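
For context (not part of the commit): given the usage message introduced in crawl.sh below, an invocation would look roughly like this, assuming the script is executable and curl, weasyprint, and pdfunite are on PATH:

    # hypothetical run; writes the merged PDF to wikipedia.pdf and keeps the
    # temporary crawl directory for inspection
    ./crawl.sh https://en.wikipedia.org/wiki/Main_Page wikipedia.pdf --keep-temp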

crawl.sh
@@ -1,28 +1,20 @@
 #!/bin/bash
 
-# Usage: ./crawl_to_pdf.sh <start_url> <max_depth> <output_pdf>
-# Example: ./crawl_to_pdf.sh https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf
-
-# Check if required arguments are provided
-if [ $# -ne 3 ]; then
-    echo "Usage: $0 <start_url> <max_depth> <output_pdf>"
-    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page 2 wikipedia.pdf"
+if [ $# -lt 2 ]; then
+    echo "Usage: $0 <start_url> <output_pdf> [--keep-temp]"
+    echo "Example: $0 https://en.wikipedia.org/wiki/Main_Page wikipedia.pdf --keep-temp"
     exit 1
 fi
 
 START_URL="$1"
-MAX_DEPTH="$2"
-OUTPUT_PDF="$3"
-
-# Validate max_depth is a positive integer
-if ! [[ "$MAX_DEPTH" =~ ^[0-9]+$ ]] || [ "$MAX_DEPTH" -lt 1 ]; then
-    echo "Error: max_depth must be a positive integer."
-    exit 1
+OUTPUT_PDF="$2"
+KEEP_TEMP=0
+if [ "$3" = "--keep-temp" ]; then
+    KEEP_TEMP=1
 fi
 
-# Check if wget, weasyprint, and pdfunite are installed
-if ! command -v wget &> /dev/null; then
-    echo "Error: wget is not installed."
+if ! command -v curl &> /dev/null; then
+    echo "Error: curl is not installed."
     exit 1
 fi
 if ! command -v weasyprint &> /dev/null; then
@@ -34,86 +26,174 @@ if ! command -v pdfunite &> /dev/null; then
     exit 1
 fi
 
-# Create a temporary directory for downloads
 TEMP_DIR=$(mktemp -d)
 echo "Working in temporary directory: $TEMP_DIR"
 
-# Extract domain from URL to restrict crawling
-DOMAIN=$(echo "$START_URL" | grep -oE '(https?://[^/]+)' | sed 's|https\?://||')
+DOMAIN=$(echo "$START_URL" | grep -oP '(?<=://)[^/]+')
+BASE_URL=$(echo "$START_URL" | grep -oE 'https?://[^/]+')
 
-# Crawl the website using wget
-echo "Crawling $START_URL (max depth: $MAX_DEPTH)..."
-wget \
-    --recursive \
-    --level="$MAX_DEPTH" \
-    --convert-links \
-    --html-extension \
-    --no-parent \
-    --domains="$DOMAIN" \
-    --no-verbose \
-    --directory-prefix="$TEMP_DIR" \
-    "$START_URL"
-
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to crawl $START_URL."
-    rm -rf "$TEMP_DIR"
-    exit 1
-fi
+QUEUE_FILE="$TEMP_DIR/queue.txt"
+VISITED_FILE="$TEMP_DIR/visited.txt"
+CRAWL_ERROR_LOG="$TEMP_DIR/crawl_errors.log"
+WEASYPRINT_ERROR_LOG="$TEMP_DIR/weasyprint_errors.log"
+echo "$START_URL" > "$QUEUE_FILE"
+touch "$VISITED_FILE" "$CRAWL_ERROR_LOG" "$WEASYPRINT_ERROR_LOG"
+
+MAX_DEPTH=2
+MAX_PAGES=100
+PAGE_COUNT=0
+
+mkdir -p "$TEMP_DIR/$DOMAIN"
+
+normalize_url() {
+    local url="$1"
+    local base="$2"
+    if [[ "$url" =~ ^/ ]]; then
+        url="$BASE_URL$url"
+    elif [[ ! "$url" =~ ^https?:// ]]; then
+        url=$(echo "$url" | sed 's|^\./||')
+        url="$BASE_URL/$(dirname "$base" | sed "s|^$BASE_URL||")/$url"
+    fi
+    url=$(echo "$url" | sed 's/#.*$//; s|/$||; s/\?.*$//')
+    echo "$url"
+}
+
+url_to_filepath() {
+    local url="$1"
+    local path=$(echo "$url" | sed "s|^$BASE_URL/||; s|/$||; s|/|_|g")
+    if [ -z "$path" ]; then
+        path="index"
+    fi
+    echo "$TEMP_DIR/$DOMAIN/$path.html"
+}
+
+calculate_depth() {
+    local url="$1"
+    local path=$(echo "$url" | sed "s|^https\?://[^/]*||; s|^/||; s|/$||")
+    if [ -z "$path" ]; then
+        echo 0
+    else
+        echo "$path" | awk -F'/' '{print NF}'
+    fi
+}
+
+echo "Crawling $START_URL (links within $DOMAIN, max depth $MAX_DEPTH, max pages $MAX_PAGES)..."
+while [ -s "$QUEUE_FILE" ] && [ "$PAGE_COUNT" -lt "$MAX_PAGES" ]; do
+    CURRENT_URL=$(head -n 1 "$QUEUE_FILE")
+    sed -i '1d' "$QUEUE_FILE"
+
+    if grep -Fx "$CURRENT_URL" "$VISITED_FILE" >/dev/null; then
+        echo "Skipping already visited: $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    echo "$CURRENT_URL" >> "$VISITED_FILE"
+
+    DEPTH=$(calculate_depth "$CURRENT_URL")
+    echo "Processing $CURRENT_URL (depth $DEPTH)" >> "$CRAWL_ERROR_LOG"
+    if [ "$DEPTH" -gt "$MAX_DEPTH" ]; then
+        echo "Skipping (depth $DEPTH > $MAX_DEPTH): $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    FILE_PATH=$(url_to_filepath "$CURRENT_URL")
+    echo "Downloading $CURRENT_URL to $FILE_PATH..."
+    curl -s -L --fail --retry 3 --retry-delay 2 "$CURRENT_URL" > "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to download $CURRENT_URL" >> "$CRAWL_ERROR_LOG"
+        continue
+    fi
+
+    ((PAGE_COUNT++))
+    echo "Crawled page $PAGE_COUNT: $CURRENT_URL"
+
+    LINKS=$(grep -o '<a[^>]*href=["'"'"'][^"'"'"']*["'"'"']' "$FILE_PATH" | sed 's/.*href=["'"'"']\([^"'"'"']*\)["'"'"'].*/\1/' | sort -u)
+    for LINK in $LINKS; do
+        if [ -z "$LINK" ] || [[ "$LINK" =~ ^(javascript:|mailto:|#|data:) ]]; then
+            continue
+        fi
+
+        NORMALIZED_LINK=$(normalize_url "$LINK" "$CURRENT_URL")
+
+        if [[ "$NORMALIZED_LINK" =~ ^https?://$DOMAIN ]]; then
+            LOCAL_PATH=$(url_to_filepath "$NORMALIZED_LINK")
+            RELATIVE_PATH=$(realpath --relative-to="$(dirname "$FILE_PATH")" "$LOCAL_PATH" 2>/dev/null || echo "$LOCAL_PATH")
+
+            ESCAPED_LINK=$(echo "$LINK" | sed 's/[&]/\\&/g')
+            sed -i "s|href=[\"']${ESCAPED_LINK}[\"']|href=\"$RELATIVE_PATH\"|g" "$FILE_PATH" 2>> "$CRAWL_ERROR_LOG"
+
+            if ! grep -Fx "$NORMALIZED_LINK" "$VISITED_FILE" >/dev/null; then
+                echo "$NORMALIZED_LINK" >> "$QUEUE_FILE"
+                echo "Queued: $NORMALIZED_LINK" >> "$CRAWL_ERROR_LOG"
+            fi
+        fi
+    done
+done
 
-# Generate file_list.txt with all HTML files
 echo "Generating list of HTML files..."
-find "$TEMP_DIR" -type f -name "*.html" > "$TEMP_DIR/file_list.txt"
+find "$TEMP_DIR" -type f \( -name "*.html" -o -name "*.htm" \) > "$TEMP_DIR/file_list.txt"
 
-# Check if file_list.txt was generated successfully
 if [ ! -s "$TEMP_DIR/file_list.txt" ]; then
-    echo "Error: No HTML files found."
-    rm -rf "$TEMP_DIR"
+    echo "Error: No HTML files found. Check crawl_errors.log for details."
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
     exit 1
 fi
 
 echo "Found $(wc -l < "$TEMP_DIR/file_list.txt") HTML files."
 
-# Array to store generated PDF files
 pdf_files=()
 
-# Convert each HTML file to PDF
 while IFS= read -r html_file; do
-    # Get the directory containing the HTML file
     html_dir=$(dirname "$html_file")
 
-    # Check if the HTML file exists
     if [ ! -f "$html_file" ]; then
-        echo "Warning: $html_file not found, skipping."
+        echo "Warning: $html_file not found, skipping." >> "$WEASYPRINT_ERROR_LOG"
         continue
     fi
 
-    # Define the output PDF for each HTML file
     pdf_file="${html_file%.html}.pdf"
+    pdf_file="${pdf_file%.htm}.pdf"
 
-    # Convert HTML to PDF using weasyprint
     echo "Converting $html_file to PDF..."
     weasyprint \
         --base-url "file://$html_dir/" \
         --media-type print \
-        "$html_file" "$pdf_file"
+        "$html_file" "$pdf_file" 2>> "$WEASYPRINT_ERROR_LOG"
 
     if [ $? -ne 0 ]; then
-        echo "Warning: Failed to convert $html_file to PDF, skipping."
+        echo "Warning: Failed to convert $html_file to PDF, see weasyprint_errors.log for details." >> "$WEASYPRINT_ERROR_LOG"
         continue
     fi
 
-    # Add the PDF to the array
     pdf_files+=("$pdf_file")
 done < "$TEMP_DIR/file_list.txt"
 
-# Check if any PDFs were generated
 if [ ${#pdf_files[@]} -eq 0 ]; then
-    echo "Error: No PDFs were generated."
-    rm -rf "$TEMP_DIR"
+    echo "Error: No PDFs were generated. Check weasyprint_errors.log for details."
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
     exit 1
 fi
 
-# Merge the PDFs into a single file
+echo "Generated ${#pdf_files[@]} PDFs."
+
 echo "Merging PDFs into $OUTPUT_PDF..."
 pdfunite "${pdf_files[@]}" "$OUTPUT_PDF"
 
@@ -121,10 +201,29 @@ if [ $? -eq 0 ]; then
     echo "Successfully created $OUTPUT_PDF"
 else
     echo "Error merging PDFs."
-    rm -rf "$TEMP_DIR"
+    cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+    cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+    cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+    cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+    cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+    if [ $KEEP_TEMP -eq 0 ]; then
+        rm -rf "$TEMP_DIR"
+    else
+        echo "Temporary directory preserved: $TEMP_DIR"
+    fi
     exit 1
 fi
 
-# Clean up temporary directory
-rm -rf "$TEMP_DIR"
-echo "Cleaned up temporary files."
+cp "$CRAWL_ERROR_LOG" "./crawl_errors.log"
+cp "$WEASYPRINT_ERROR_LOG" "./weasyprint_errors.log"
+cp "$QUEUE_FILE" "./queue.log" 2>/dev/null
+cp "$VISITED_FILE" "./visited.log" 2>/dev/null
+cp "$TEMP_DIR/file_list.txt" "./file_list.log" 2>/dev/null
+echo "Debug files saved: crawl_errors.log, weasyprint_errors.log, queue.log, visited.log, file_list.log"
+
+if [ $KEEP_TEMP -eq 0 ]; then
+    rm -rf "$TEMP_DIR"
+    echo "Cleaned up temporary files."
+else
+    echo "Temporary directory preserved: $TEMP_DIR"
+fi
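
For reference, a rough sketch of what the new helper functions should produce for an assumed start URL; the values below are inferred from the sed/awk expressions in the commit, not captured output:

    # With START_URL=https://en.wikipedia.org/wiki/Main_Page:
    #   DOMAIN   -> en.wikipedia.org
    #   BASE_URL -> https://en.wikipedia.org
    # normalize_url "/wiki/Bash" "$START_URL"
    #   -> https://en.wikipedia.org/wiki/Bash         (root-relative path resolved; fragments, queries, trailing slash stripped)
    # calculate_depth "https://en.wikipedia.org/wiki/Bash"
    #   -> 2                                          (path segments counted with awk -F'/')
    # url_to_filepath "https://en.wikipedia.org/wiki/Bash"
    #   -> $TEMP_DIR/en.wikipedia.org/wiki_Bash.html  (slashes in the path flattened to underscores)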