#!/bin/bash
#
# Downloads an article from the web and converts it to PDF with pandoc.
#
# USAGE: article2pdf <url>
#
# Requires: curl, pup, pandoc (with the xelatex PDF engine).
set -euo pipefail

# Validate the single required argument up front (was: silently curled "").
url=${1:?usage: ${0##*/} <url>}

pdfs_dir=~/Downloads/articles
mkdir -p -- "$pdfs_dir"   # was missing: pandoc fails if the dir doesn't exist

html_file=$(mktemp)
cleanup() { rm -f -- "$html_file"; }
# Remove the temp file on EVERY exit path; the original leaked it whenever
# the first pandoc run succeeded (exit 0 came before rm) or the script died.
trap cleanup EXIT

# Some sites ban curl's default user agent, so masquerade as Firefox.
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'

curl --user-agent "$user_agent" --location --silent "$url" > "$html_file"

# Derive a filesystem-friendly name from the page's <title>:
# first non-blank line, whitespace-trimmed, spaces replaced with dashes.
# '|| true' because grep exits 1 on a titleless page and we handle that below.
title=$(pup --file "$html_file" 'title text{}' \
  | grep -v '^\s*$' \
  | head -n 1 \
  | sed -e 's/^ *//' -e 's/ *$//' -e 's/ /-/g') || true
echo "title based on html: |$title|"

# Fall back to the URL's last path segment (extension stripped) if no <title>.
if [[ -z "$title" ]]; then
  title=${url##*/}
  title=${title%.*}
fi
echo "final title: $title"

pdf_file="$pdfs_dir/$title.pdf"
echo "pdf file: $pdf_file"

# First attempt: let pandoc fetch the URL itself, sending the same UA header.
if pandoc --request-header User-Agent:"$user_agent" "$url" --pdf-engine=xelatex -o "$pdf_file"; then
  exit 0
fi

# The remote build may fail on unconvertible images (e.g. webp); retry from
# the locally downloaded HTML copy.
# NOTE(review): this does not actually strip <img> tags — it presumably works
# because relative image URLs fail to resolve locally; confirm intent.
pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"