#!/bin/bash
#
# Downloads an article from the web and converts it to PDF with pandoc.
# USAGE: article2pdf <url>
#
# The PDF is written to ~/Downloads/articles/<title>.pdf, where <title> is the
# first non-blank line of the page's <title> text with surrounding whitespace
# trimmed and spaces and slashes replaced by dashes ("untitled" if the page
# has no usable title).
#
# Exit status: 0 if a PDF was produced, non-zero otherwise.
# Requires: curl, pup, pandoc (with the xelatex PDF engine).
#
# Origin: commit 355d4996dc32988aabe000fbfaa1a9bbdd9ec585 ("Add initial
# scripts") by Marcin Chrzanowski, Sun, 26 Mar 2023 13:49:36 +0200.

set -u

pdfs_dir=~/Downloads/articles

# Some sites ban curl, so use "firefox" user agent
user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0'

# Path of the temporary HTML download; set in main, removed by cleanup.
html_file=''

# Prints a message to stderr and aborts with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Prints the usage line to stderr and aborts with status 2.
usage() {
  printf 'USAGE: %s <url>\n' "${0##*/}" >&2
  exit 2
}

# Removes the temporary HTML file. Installed as an EXIT trap so that it runs
# on every exit path, including the successful one.
cleanup() {
  if [[ -n "$html_file" ]]; then
    rm -f -- "$html_file"
  fi
}

# Prints a file-name-safe title extracted from the HTML file given as $1.
# Slashes are replaced as well as spaces so that the title can never be
# interpreted as a directory component.
page_title() {
  pup --file "$1" 'title text{}' \
    | grep -v '^[[:space:]]*$' \
    | head -n 1 \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' \
      -e 's/ /-/g' -e 's|/|-|g'
}

# Downloads the URL given as $1 and converts it to a PDF in $pdfs_dir.
# Returns 0 if a PDF was produced, non-zero otherwise.
main() {
  [[ -n "${1:-}" ]] || usage
  local url=$1
  local title pdf_file

  html_file=$(mktemp) || die "article2pdf: mktemp failed"
  trap cleanup EXIT

  mkdir -p -- "$pdfs_dir" || die "article2pdf: cannot create $pdfs_dir"

  # A failed download is not fatal: pandoc fetches the URL itself below and
  # may still succeed. Only the title and the image-less fallback need it.
  if ! curl --user-agent "$user_agent" --location --silent --show-error \
    "$url" > "$html_file"; then
    printf 'article2pdf: warning: could not download %s with curl\n' \
      "$url" >&2
  fi

  title=$(page_title "$html_file")
  if [[ -z "$title" ]]; then
    title=untitled
  fi
  pdf_file="${pdfs_dir}/${title}.pdf"

  if pandoc --request-header "User-Agent:${user_agent}" "$url" \
    --pdf-engine=xelatex -o "$pdf_file"; then
    return 0
  fi

  # we may have failed due to unconvertable images (e.g. webp)
  # in that case try rebuilding without images
  [[ -s "$html_file" ]] \
    || die "article2pdf: nothing was downloaded from $url; giving up"
  pandoc "$html_file" -f html --pdf-engine=xelatex -o "$pdf_file"
}

main "$@"