m-chrzan.xyz
aboutsummaryrefslogtreecommitdiff
path: root/article2pdf
diff options
context:
space:
mode:
Diffstat (limited to 'article2pdf')
-rwxr-xr-x article2pdf 28
1 files changed, 28 insertions, 0 deletions
diff --git a/article2pdf b/article2pdf
new file mode 100755
index 0000000..4259ee4
--- /dev/null
+++ b/article2pdf
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Downloads an article from the web and converts it to PDF with pandoc.
+# The PDF is named after the page <title> and saved in ~/Downloads/articles.
+# USAGE: article2pdf <url>
+
+[[ -n "${1:-}" ]] || { echo 'USAGE: article2pdf <url>' >&2; exit 2; }
+pdfs_dir=~/Downloads/articles
+mkdir -p -- "$pdfs_dir" || exit 1
+
+html_file=$(mktemp) || exit 1
+trap 'rm -f -- "$html_file"' EXIT # remove the temp file on every exit path
+
+# Some sites ban curl, so use "firefox" user agent
+user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'
+curl --user-agent "$user_agent" --location --silent "$1" > "$html_file"
+
+# First non-blank title line, trimmed; spaces and slashes become dashes so
+# the result is a single file name. Falls back to "article" if no title.
+title=$(pup --file "$html_file" 'title text{}' | grep -v '^\s*$' | head -n 1 \
+  | sed -e 's/^ *//' -e 's/ *$//' -e 's|[ /]|-|g')
+pdf_file=$pdfs_dir/${title:-article}.pdf
+
+# Building from the URL may fail on unconvertible images (e.g. webp); in that
+# case retry from the downloaded HTML. The last pandoc run sets the exit status.
+pandoc --request-header User-Agent:"$user_agent" "$1" \
+  --pdf-engine=xelatex -o "$pdf_file" \
+  || pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"