#!/bin/sh
#
# Walk a cgit-style repo index and, for every repository listed in it,
# save a DESCR file and any README files found under the repo's /plain path.
#
# Usage: <script> <urlbase> <destdir> [<repourl>]
#   $1  URL of the index page (downloaded with $CURL, parsed with xml2tsv)
#   $2  destination directory; output goes to $2/<proto>/<host/path>/<link>/
#   $3  optional prefix prepended to every repo link; defaults to $1
#
# External tools used: torify, curl, xml2tsv, mktemp.
#

if [ $# -lt 2 ]; then
  printf 'Usage: %s <urlbase> <destdir> [<repourl>]\n' "$0" >&2
  exit 1
fi

FIN="/dev/stdin" # NOTE(review): not referenced anywhere in this script
URLBASE="$1"
DEST="$2"
SUBPATH="/plain"
# CURL holds a whole command line, so it is expanded UNQUOTED on purpose.
CURL="torify curl -Ls "
TMPFILE=""
READMES="README README.txt README.md readme readme.txt readme.md"
# Only fall back to URLBASE when the third argument is absent (not when empty).
REPOURL="${3-$URLBASE}"

## func
# Remove the temporary index file. Installed as the EXIT trap.
cleanup() {
  [ -n "$TMPFILE" ] && rm -f -- "$TMPFILE"
  return 0
}

## func
# Extract the repo list from the already-downloaded index in $TMPFILE.
# Sets the global `repos` to one "link|name" entry per line, where link is
# the third tab-separated field with its "href=" prefix removed and name is
# the fourth field.
get_repos() {
  repos=$(
    grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" "$TMPFILE" \
      | awk -F'\t' '{ sub(/^href=/, "", $3); print $3 "|" $4 }'
  )
}

## func
# Print a description record for one repo on stdout.
#   $1  repo name (echoed verbatim in the "reponame:" line)
#   $2  URL of the repo summary page
# Output: "reponame:" and "url:" lines, then the text of the last two
# "td class=sub" cells of the page (author and description).
get_descr() {
  reponame=$1
  url=$2
  printf 'reponame: %s\n' "$reponame"
  printf 'url: %s\n' "$url"
  ## Get author and description
  # stdin is redirected so curl can never consume the caller's input.
  $CURL "$url" </dev/null \
    | xml2tsv \
    | grep -Ei "/html/body/div/table/tr/td[[:blank:]]+class=sub" \
    | tail -n 2 \
    | cut -f 3-
}

## func
# Try to download each candidate README name from a repo.
#   $1  base URL of the repo's plain-file tree
#   $2  directory in which the files are stored
# A candidate is discarded when the download fails, is empty, or is the
# server's "Not found" error page.
get_readmes() {
  LINK=$1
  DESTDIR=$2
  for f in $READMES; do
    printf " trying file %s..." "$LINK/$f"
    if $CURL "$LINK/$f" </dev/null >"$DESTDIR/$f" && [ -s "$DESTDIR/$f" ]; then
      # grep -c prints the number of matching lines; 0 means the response
      # is not the error page.
      failure=$(xml2tsv <"$DESTDIR/$f" 2>/dev/null \
        | grep -Eaic "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found")
    else
      failure=1
    fi
    if [ "$failure" != 0 ]; then
      rm -f -- "$DESTDIR/$f"
      printf "[FAILED]\n"
    else
      printf "[OK]\n"
    fi
    # Pause between requests.
    sleep 1
  done
}

PROTO=${URLBASE%%://*}
DIRBASE="$PROTO/${REPOURL##[a-z]*://}"

echo "proto: $PROTO"
echo "dirbase: $DIRBASE"

TMPFILE=$(mktemp "${TMPDIR:-/tmp}/cgit_index.XXXXXX") || {
  printf 'error: cannot create temporary file\n' >&2
  exit 1
}

# SIGKILL cannot be trapped. Signals are turned into a normal exit so the
# EXIT trap performs the cleanup exactly once.
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

# Download and convert the index once; get_repos parses the saved copy.
$CURL "${URLBASE}" </dev/null | xml2tsv >"$TMPFILE"
if [ ! -s "$TMPFILE" ]; then
  printf 'error: could not fetch or parse index at %s\n' "$URLBASE" >&2
  exit 1
fi

get_repos

# Read one "link|name" entry per line so names containing blanks survive.
while IFS='|' read -r link name; do
  [ -n "$link" ] || continue
  printf "Retrieving repo %s...\n" "${name:-$link}"
  baselink="${REPOURL}${link}"
  REPODIR="$DEST/$DIRBASE/$link/"
  if ! mkdir -p -- "$REPODIR"; then
    printf 'error: cannot create %s, skipping\n' "$REPODIR" >&2
    continue
  fi
  get_descr "$link" "$baselink" >"${REPODIR}/DESCR"
  ## Get READMEs
  get_readmes "$baselink/$SUBPATH" "$REPODIR"
done <<EOF
$repos
EOF

exit 0

### the readme file is at REPLINK/plain/README
### if not found, look for "/html/body/div/div/div class=error Not found"