#!/bin/sh
#
# Parse stagit-style repos
#
# Downloads the index page found at <urlbase>, extracts the list of
# repositories from it and, for each repository, stores under
# <destdir>/<proto>/<host-and-path>/<repo>/ :
#   - DESCR    : repo name, URL and the two index lines following the
#                repo's anchor (expected to hold description and author)
#   - README*  : every README variant from $READMES that could be fetched
#
# Requires: torify, curl, xml2tsv, mktemp.
#

if [ $# -lt 2 ]; then
  printf "Usage: %s <urlbase> <destdir>\n" "$0" >&2
  exit 1
fi

URLBASE="$1"
DEST="$2"
SUBPATH="/file"
# Command plus options: intentionally expanded unquoted where it is used.
CURL="torify curl -Ls "
READMES="README.html README.txt.html README.md.html readme.html readme.txt.html readme.md.html"
TMPFILE=""

## func
# Print a message on stderr and terminate with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

## func
# Remove the temporary index dump. Installed as the EXIT trap, so it runs
# on every exit path; it must not call exit itself.
cleanup() {
  if [ -n "$TMPFILE" ]; then
    rm -f -- "$TMPFILE"
  fi
}

## func
# Set the global $repos to one "link|name" entry per line, built from the
# last two whitespace-separated fields of every index line that mentions
# an anchor inside the repository table.
get_repos() {
  repos=$(grep "/html/body/div/table/tbody/tr/td/a" "$TMPFILE" \
    | awk '{print $(NF-1), $NF}' \
    | sed -E 's/href=//g;s/ /\|/')
}

## func
# Print the name, the URL and the two index lines that follow the anchor
# of the given repository.
#   $1 - repository name as it appears in the index
#   $2 - base URL of the repository
get_descr() {
  reponame=$1
  url=$2
  echo "reponame: $reponame"
  echo "url: $url"
  # The name ends up inside an extended regex: escape its metacharacters
  # so that names such as "c++" or "a.b" match themselves only.
  reponame_re=$(printf '%s' "$reponame" | sed 's/[][\.*^$(){}?+|]/\\&/g')
  ## Get author and description
  grep -Ei -A 2 \
    "/html/body/div/table/tbody/tr/td/a[[:blank:]]+href=${reponame_re}/log\.html" \
    "$TMPFILE" \
    | tail -2 | cut -f 2-
}

## func
# Try to download every README variant listed in $READMES.
#   $1 - URL of the directory holding the rendered files
#   $2 - destination directory (must already exist)
# A file counts as failed when the download itself fails or when the
# returned page is titled "404 Not Found".
get_readmes() {
  LINK=$1
  DESTDIR=$2
  for f in $READMES; do
    printf " trying file %s..." "$LINK/$f"
    readme_tmp="$DESTDIR/$f.tmp"
    # shellcheck disable=SC2086 # $CURL carries the command and its options
    if ! $CURL "$LINK/$f" > "$readme_tmp"; then
      printf "[FAILED]\n"
    elif xml2tsv < "$readme_tmp" 2>/dev/null \
      | grep -Eaiq "^/html/head/title[[:blank:]]+404 Not Found"; then
      printf "[FAILED]\n"
    else
      # Keep only the text of the numbered source lines and undo the
      # escaping of newline, tab and backslash.
      # NOTE(review): an escaped backslash followed by "n" looks like it
      # would be mangled by the first substitution - confirm.
      xml2tsv < "$readme_tmp" 2>/dev/null \
        | grep -Eai "/html/body/div/pre/a[[:blank:]]+href=#.*[[:blank:]]+class=line" \
        | cut -f 6- \
        | sed -E 's/\\n//g;s/\\t/\t/g;s/\\\\/\\/g' > "$DESTDIR/$f"
      printf "[OK]\n"
    fi
    rm -f -- "$readme_tmp"
    # Pause between requests.
    sleep 1
  done
}

# main loop

PROTO=${URLBASE%%://*}
DIRBASE="$PROTO/${URLBASE##[a-z]*://}"

echo "proto: $PROTO"
echo "dirbase: $DIRBASE"

TMPFILE=$(mktemp "${TMPDIR:-/tmp}/stagit_parse.XXXXXX") \
  || die "cannot create temporary file"

# SIGKILL cannot be trapped. INT and TERM are turned into a plain exit so
# that the EXIT trap performs the cleanup exactly once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# shellcheck disable=SC2086 # $CURL carries the command and its options
$CURL "${URLBASE}" | xml2tsv > "$TMPFILE"
[ -s "$TMPFILE" ] || die "could not fetch or parse index at $URLBASE"

get_repos

# The list is fed through fd 3 so that commands run inside the loop cannot
# consume it from stdin.
while IFS='|' read -r link name <&3; do
  [ -n "$link" ] || continue
  link="${link%%log.html}"

  # Links come from the remote page: refuse anything that could place
  # REPODIR outside of the destination tree.
  case "$link" in
    ..|../*|*/..|*/../*)
      printf "skipping suspicious link: %s\n" "$link" >&2
      continue
      ;;
  esac

  baselink="$URLBASE/$link"
  printf "link: %s\nbaselink: %s\n" "$link" "$baselink" 1>&2
  REPODIR="$DEST/$DIRBASE/$link/"
  if ! mkdir -p -- "$REPODIR"; then
    printf "cannot create directory: %s\n" "$REPODIR" >&2
    continue
  fi
  ## get description
  get_descr "$name" "$baselink" > "${REPODIR}/DESCR"
  ## Get READMEs
  get_readmes "$baselink/$SUBPATH" "$REPODIR"
done 3<<EOF
$repos
EOF