article2pdf


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

#!/bin/bash

# Downloads an article from the web and converts it to PDF with pandoc.
# USAGE: article2pdf <url>
#
# The PDF is written to ~/Downloads/articles/<title>.pdf, where <title> is
# the first non-blank line of the page's <title> text (spaces and slashes
# replaced by dashes) or, when no title can be extracted, the last path
# component of the URL without its extension.
#
# Uses curl, pup and pandoc (with the xelatex PDF engine).
# Exit status: 0 on success, 2 on bad usage, otherwise the status of the
# failing step.

if [ $# -ne 1 ] || [ -z "$1" ]; then
    echo "USAGE: ${0##*/} <url>" >&2
    exit 2
fi

url=$1
pdfs_dir=~/Downloads/articles

# The output directory may not exist yet (e.g. on first run).
mkdir -p -- "$pdfs_dir" || exit 1

html_file=$(mktemp) || exit 1
# Clean up the temp file on every exit path, including the early "exit 0"
# taken when the first pandoc run succeeds.
trap 'rm -f -- "$html_file"' EXIT

# Some sites ban curl, so use "firefox" user agent
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'
if ! curl --user-agent "$user_agent" --location --silent "$url" > "$html_file"; then
    # Not fatal: the title can still be derived from the URL and pandoc
    # fetches the page itself in the first conversion attempt.
    echo "warning: curl could not download $url" >&2
fi

# First non-blank line of the <title>, trimmed, with spaces turned into
# dashes. Slashes are replaced too, since they would otherwise be taken as
# directory separators in the output path.
title=$(pup --file "$html_file" 'title text{}' \
    | grep -v '^\s*$' \
    | head -n 1 \
    | sed -e 's/^ *//' -e 's/ *$//' -e 's/ /-/g' -e 's|/|-|g')

echo "title based on html: |$title|"

if [ -z "$title" ]; then
    # Fall back to the last path component of the URL, ignoring any query
    # string, fragment or trailing slash, and dropping the extension.
    title=${url%%\?*}
    title=${title%%\#*}
    title=${title%/}
    title=${title##*/}
    title=${title%.*}
fi

# Last resort so the output is never the hidden file ".pdf".
if [ -z "$title" ]; then
    title=article
fi

echo "final title: $title"

pdf_file="$pdfs_dir/$title.pdf"

echo "pdf file: $pdf_file"

if pandoc --request-header User-Agent:"$user_agent" "$url" --pdf-engine=xelatex -o "$pdf_file"; then
    exit 0
fi

# we may have failed due to unconvertable images (e.g. webp)
# in that case retry from the HTML that curl already downloaded
if [ ! -s "$html_file" ]; then
    echo "error: no downloaded HTML to fall back on for $url" >&2
    exit 1
fi

# Last command: its exit status becomes the script's exit status (the EXIT
# trap does not override it).
pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"