diff options
Diffstat (limited to 'article2pdf')
-rwxr-xr-x | article2pdf | 28 |
1 files changed, 28 insertions, 0 deletions
#!/bin/bash
#
# article2pdf - downloads an article from the web and converts it to PDF
# with pandoc (xelatex PDF engine).
#
# USAGE: article2pdf <url>
#
# Install as an executable file (mode 0755) named `article2pdf`.
#
# The PDF is written to ~/Downloads/articles/<title>.pdf, where <title> is
# the first non-blank line of the page's <title> element with surrounding
# whitespace removed and inner whitespace and '/' replaced by '-'.
#
# Requires: curl, pup, pandoc (and a xelatex installation for pandoc).
#
# Exit status: 0 when a PDF was produced, 2 on a usage error, otherwise the
# status of the step that failed.

set -euo pipefail

pdfs_dir=~/Downloads/articles
readonly pdfs_dir

# Some sites ban curl, so use "firefox" user agent
readonly user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'

# Path of the temporary HTML download. Kept global so the EXIT trap can
# still see it after main() has returned.
html_file=''

# Prints a message to stderr and aborts with status 1.
die() {
  printf 'article2pdf: %s\n' "$*" >&2
  exit 1
}

# Prints the usage line to stderr and aborts with status 2.
usage() {
  printf 'USAGE: %s <url>\n' "${0##*/}" >&2
  exit 2
}

# Removes the temporary HTML download, if one was created.
cleanup() {
  if [[ -n "$html_file" ]]; then
    rm -f -- "$html_file"
  fi
}

# Reads the page title text on stdin and prints a file-name-safe version of
# its first non-blank line: surrounding whitespace is stripped, and every
# remaining whitespace character or '/' becomes '-'. Prints nothing when the
# input has no non-blank line.
title_to_basename() {
  awk '
    !found {
      gsub(/^[[:space:]]+|[[:space:]]+$/, "")
      if ($0 != "") {
        gsub(/[[:space:]]/, "-")
        gsub("/", "-")   # a slash would be taken as a directory separator
        print
        found = 1
      }
    }
  '
}

# Downloads the page at $1 and converts it to a PDF in $pdfs_dir.
# Arguments: $1 - URL of the article
# Returns:   0 if either pandoc attempt succeeded, non-zero otherwise
main() {
  [[ $# -ge 1 && -n "${1:-}" ]] || usage
  local url=$1
  local tool title pdf_file

  for tool in curl pup pandoc; do
    command -v "$tool" >/dev/null || die "required tool not found: $tool"
  done

  mkdir -p -- "$pdfs_dir" || die "cannot create $pdfs_dir"

  html_file=$(mktemp) || die "mktemp failed"
  # The trap fires on every exit path, including the early success return.
  trap cleanup EXIT

  # --fail makes HTTP errors (4xx/5xx) a curl failure instead of saving the
  # error page; --url keeps a URL starting with '-' from being read as an
  # option.
  curl --user-agent "$user_agent" --location --silent --show-error --fail \
    --output "$html_file" --url "$url" \
    || die "download failed: $url"

  # A page without a usable <title> must not end up as the hidden file
  # ".pdf", so fall back to a timestamped name.
  title=$(pup --file "$html_file" 'title text{}' | title_to_basename) || title=''
  if [[ -z "$title" ]]; then
    title="article-$(date +%Y%m%d-%H%M%S)"
  fi
  pdf_file="$pdfs_dir/${title}.pdf"

  # First attempt: let pandoc fetch the URL itself, with the same user agent.
  if pandoc --request-header "User-Agent:$user_agent" "$url" \
    --pdf-engine=xelatex -o "$pdf_file"; then
    return 0
  fi

  # We may have failed due to unconvertable images (e.g. webp); in that case
  # try rebuilding from the already-downloaded HTML, which is meant to give a
  # PDF without images.
  # NOTE(review): pandoc may still try to fetch images referenced by absolute
  # URL in the saved HTML - confirm this fallback really skips them.
  printf 'article2pdf: direct conversion failed, retrying from saved HTML\n' >&2
  pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"
}

main "$@"