blob: 4259ee4f119ae3fa2de8aef8e0bd36a51c22486a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
#!/bin/bash
# Downloads an article from the web and converts it to PDF with pandoc.
# USAGE: article2pdf <url>
#
# The PDF is written to ~/Downloads/articles/<title>.pdf, where <title> is
# the first non-blank line of the page's <title> text with surrounding
# spaces trimmed and spaces/slashes replaced by dashes ("article" is used
# when no title can be extracted).
# Requires: curl, pup, pandoc (invoked with --pdf-engine=xelatex).
# Exit status: 0 on success, 2 when the URL argument is missing, otherwise
# non-zero when the download or both conversion attempts fail.

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <url>\n' "${0##*/}" >&2
  exit 2
fi
url=$1
pdfs_dir=~/Downloads/articles

html_file=$(mktemp) || {
  printf 'error: could not create a temporary file\n' >&2
  exit 1
}
# Clean up the temp file on every exit path (success, failure, early exit);
# a trailing rm would be skipped whenever the script exits early.
trap 'rm -f -- "$html_file"' EXIT

# pandoc cannot write into a directory that does not exist yet.
mkdir -p -- "$pdfs_dir" || {
  printf 'error: could not create %s\n' "$pdfs_dir" >&2
  exit 1
}

# Some sites ban curl, so use "firefox" user agent
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'
if ! curl --user-agent "$user_agent" --location --silent "$url" > "$html_file"; then
  printf 'error: failed to download %s\n' "$url" >&2
  exit 1
fi

# Build the file name from the page title. Slashes are replaced as well as
# spaces, because a "/" in the title would be read as a directory separator.
title=$(pup --file "$html_file" 'title text{}' \
  | grep -v '^[[:space:]]*$' \
  | head -n 1 \
  | sed -e 's/^ *//' -e 's/ *$//' -e 's/ /-/g' -e 's|/|-|g')
# Fall back to a fixed name so an empty title does not yield a hidden ".pdf".
pdf_file="${pdfs_dir}/${title:-article}.pdf"

status=0
if ! pandoc --request-header "User-Agent:${user_agent}" "$url" \
  --pdf-engine=xelatex -o "$pdf_file"; then
  # we may have failed due to unconvertable images (e.g. webp)
  # in that case try rebuilding without images
  pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file" || status=$?
fi
# Report the fallback's failure to the caller instead of masking it.
exit "$status"
|