#!/bin/bash
# Downloads an article from the web and converts it to PDF with pandoc.
#
# USAGE: article2pdf <url>
#
# The PDF is written to ~/Downloads/articles/<title>.pdf, where <title> is
# the first non-blank line of the page's <title> text with surrounding
# spaces trimmed and remaining spaces (and slashes) replaced by dashes.
#
# Requires: curl, pup, pandoc (with the xelatex PDF engine).
# Exit status: 0 on success, 2 on usage error, otherwise the status of the
# last pandoc attempt.

set -u

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <url>\n' "${0##*/}" >&2
  exit 2
fi
url=$1

pdfs_dir=~/Downloads/articles

html_file=$(mktemp) || {
  printf '%s: could not create a temporary file\n' "${0##*/}" >&2
  exit 1
}
# Remove the temporary HTML copy on every exit path, including the early
# `exit 0` taken when the first pandoc run succeeds.
trap 'rm -f -- "$html_file"' EXIT

# Some sites ban curl, so use "firefox" user agent
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'

# Best effort: the saved copy is only needed for the title and the fallback
# conversion, so a failed download is reported but does not abort the run.
if ! curl --user-agent "$user_agent" --location --silent "$url" > "$html_file"; then
  printf '%s: warning: could not download %s with curl\n' "${0##*/}" "$url" >&2
fi

# Build the file name from the page title. Slashes are replaced as well,
# since they would otherwise be read as directory separators in the path.
title=$(pup --file "$html_file" 'title text{}' \
  | grep -v '^[[:space:]]*$' \
  | head -n 1 \
  | sed -e 's/^ *//' -e 's/ *$//' -e 's/ /-/g' -e 's|/|-|g')
# Without a title the output would be the hidden file ".pdf".
[[ -n "$title" ]] || title=untitled

mkdir -p -- "$pdfs_dir" || exit 1
pdf_file=$pdfs_dir/$title.pdf

if pandoc --request-header "User-Agent:$user_agent" "$url" \
  --pdf-engine=xelatex -o "$pdf_file"; then
  exit 0
fi

# We may have failed due to unconvertible images (e.g. webp); in that case
# try rebuilding from the saved HTML copy, which is intended to skip them.
# This is the last command, so its status becomes the script's exit status.
pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"