blob: dcef4416ff20b7ec7da10efb0b1eea46a988b16c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
#!/bin/bash
# Downloads an article from the web and converts it to PDF with pandoc.
# USAGE: article2pdf <url>
#
# The PDF is written to ~/Downloads/articles/<title>.pdf, where <title> is
# the first non-blank line of the page's <title> element (spaces replaced by
# dashes) or, when that is empty, the last path component of the URL with
# its extension removed.
#
# Uses: curl, pup, pandoc (with the xelatex PDF engine).
# Exit status: 0 on success, 2 when no URL is given, non-zero otherwise.

if [ -z "${1:-}" ]; then
  echo "USAGE: ${0##*/} <url>" >&2
  exit 2
fi
url=$1

pdfs_dir=~/Downloads/articles
# pandoc does not create missing parent directories for its output file.
mkdir -p -- "$pdfs_dir" || exit 1

html_file=$(mktemp) || exit 1
# Remove the temp file on every exit path, including the early success exit.
trap 'rm -f -- "$html_file"' EXIT

# Some sites ban curl, so use "firefox" user agent
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'
# A failed download is not fatal here: the title then falls back to the URL
# and pandoc fetches the page itself below.
curl --user-agent "$user_agent" --location --silent "$url" > "$html_file"

# First non-blank line of <title>, trimmed, with spaces turned into dashes.
title=$(pup --file "$html_file" 'title text{}' \
  | grep -v '^\s*$' \
  | head -n 1 \
  | sed -e 's/^ *//' -e 's/ *$//' -e 's/ /-/g')
echo "title based on html: |$title|"

if [ -z "$title" ]; then
  # No usable <title>: derive a name from the URL's last path component.
  title=${url##*/}
  title=${title%.*}
fi
# A slash in the title would be interpreted as a directory separator.
title=${title//\//-}
if [ -z "$title" ]; then
  # e.g. the URL ends in "/": avoid producing a file literally named ".pdf".
  title=article
fi
echo "final title: $title"

pdf_file="$pdfs_dir/$title.pdf"
echo "pdf file: $pdf_file"

if pandoc --request-header User-Agent:"$user_agent" "$url" --pdf-engine=xelatex -o "$pdf_file"; then
  exit 0
fi

# we may have failed due to unconvertable images (e.g. webp)
# in that case try rebuilding without images
pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"
# Report the fallback conversion's status rather than that of the cleanup.
exit $?
|