article2pdf


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

#!/bin/bash

# Downloads an article from the web and converts it to PDF with pandoc.
# USAGE: article2pdf <url>
#
# The PDF is written to ~/Downloads/articles and named after the first
# non-blank line of the page's <title>, with spaces turned into dashes.
# Requires curl, pup and pandoc (using the xelatex PDF engine).
# Exit status: 0 if a PDF was produced, 2 on a usage error, otherwise the
# status of the command that failed.

if [[ -z "${1:-}" ]]; then
    echo "USAGE: article2pdf <url>" >&2
    exit 2
fi
url=$1

pdfs_dir=~/Downloads/articles
# pandoc does not create missing parent directories for its output file.
mkdir -p -- "$pdfs_dir" || exit 1

html_file=$(mktemp) || exit 1
# Clean up the downloaded HTML on every exit path, including the early
# success path and interrupted runs.
trap 'rm -f -- "$html_file"' EXIT

# Some sites ban curl, so use "firefox" user agent
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'
# Best effort: a failed download is not fatal here, because the first pandoc
# attempt below fetches the URL on its own.
curl --user-agent "$user_agent" --location --silent "$url" > "$html_file"

# Derive the file name from the page title: first non-blank line, trimmed,
# with spaces and path separators replaced by dashes so the title can never
# point into another directory.
title=$(pup --file "$html_file" 'title text{}' \
    | grep -v '^\s*$' \
    | head -n 1 \
    | sed -e 's/^ *//' -e 's/ *$//' -e 's/ /-/g' -e 's|/|-|g')

# Without a usable title the output would be the hidden file ".pdf".
pdf_file=$pdfs_dir/${title:-article}.pdf

if pandoc --request-header User-Agent:"$user_agent" "$url" --pdf-engine=xelatex -o "$pdf_file"; then
    exit 0
fi

# we may have failed due to unconvertable images (e.g. webp)
# in that case try rebuilding from the HTML that curl already downloaded
pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"
# Report the fallback's result; the EXIT trap does not alter this status.
exit $?