From ddb926830977e7a4a5fe5b91820bdce34c8826e7 Mon Sep 17 00:00:00 2001 From: KatolaZ Date: Wed, 8 Jan 2020 19:56:48 +0000 Subject: refactor parse_stagit -- get author and description --- parse_stagit | 80 +++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/parse_stagit b/parse_stagit index 39feda2..be52ee3 100755 --- a/parse_stagit +++ b/parse_stagit @@ -12,51 +12,87 @@ fi FIN="/dev/stdin" URLBASE="$1" DEST="$2" - SUBPATH="/file" - CURL="torify curl -Ls " -PROTO=${URLBASE%%:\/\/*} -DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}" -echo "proto: $PROTO" -echo "dirbase: $DIRBASE" -READMES="README.html README.txt.html README.md.html readme.html readme.txt.html readme.md.html" +## func +cleanup () { + rm -f $TMPFILE + exit +} +## func +get_repos() { -repos=$($CURL "${URLBASE}" | xml2tsv | grep "/html/body/div/table/tbody/tr/td/a" \ +repos=$(cat $TMPFILE | grep "/html/body/div/table/tbody/tr/td/a" \ | awk '{print $(NF-1), $NF}' \ | sed -E 's/href=//g;s/ /\|/'\ ) +} +## func +get_descr() { + reponame=$1 + echo "reponame: $reponame" + ## Get author and description + cat "$TMPFILE" | grep -Ei -A 2 "/html/body/div/table/tbody/tr/td/a[[:blank:]]+href=${reponame}/log.html" |\ + tail -2 | cut -f 2- + +} -for r in $repos; do - name=$(echo "$r" | cut -d "|" -f 2 ) - link=$(echo "$r" | cut -d "|" -f 1 ) - link="${link%%log.html}" - baselink=$(printf "%s/%s" $URLBASE $link) - printf "link: %s\nbaselink: %s\n" $link $baselink 1>&2 - REPODIR="$DEST/$DIRBASE/$link/" - mkdir -p $REPODIR +## func +get_readmes() { + LINK=$1 + DESTDIR=$2 for f in $READMES; do - printf " trying file %s..." $baselink/$SUBPATH/$f - $CURL "$baselink/$SUBPATH/$f" > $REPODIR/$f.tmp - failure=$(xml2tsv < $REPODIR/$f.tmp 2>/dev/null | \ + printf " trying file %s..." $LINK/$f + $CURL "$LINK/$f" > $DESTDIR/$f.tmp + failure=$(xml2tsv < $DESTDIR/$f.tmp 2>/dev/null | \ grep -Eaic "^/html/head/title[[:blank:]]+404 Not Found") echo $failure if [ "$failure" = 1 ]; then printf "[FAILED]\n" else - xml2tsv < $REPODIR/$f.tmp 2>/dev/null | \ + xml2tsv < $DESTDIR/$f.tmp 2>/dev/null | \ grep -Eai "/html/body/div/pre/a[[:blank:]]+href=#.*[[:blank:]]+class=line" | \ cut -f 6- | \ - sed -E 's/\\n//g;s/\\t/\t/g;s/\\\\/\\/g' > $REPODIR/$f + sed -E 's/\\n//g;s/\\t/\t/g;s/\\\\/\\/g' > $DESTDIR/$f printf "[OK]\n" fi - rm -f $REPODIR/$f.tmp + rm -f $DESTDIR/$f.tmp sleep 1 done +} + +# main loop + +PROTO=${URLBASE%%:\/\/*} +DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}" +echo "proto: $PROTO" +echo "dirbase: $DIRBASE" + +READMES="README.html README.txt.html README.md.html readme.html readme.txt.html readme.md.html" +TMPFILE="./tmp_$$" + +trap cleanup EXIT KILL TERM INT + +$CURL "${URLBASE}" | xml2tsv > $TMPFILE + +get_repos + +for r in $repos; do + name=$(echo "$r" | cut -d "|" -f 2 ) + link=$(echo "$r" | cut -d "|" -f 1 ) + link="${link%%log.html}" + baselink=$(printf "%s/%s" $URLBASE $link) + printf "link: %s\nbaselink: %s\n" $link $baselink 1>&2 + REPODIR="$DEST/$DIRBASE/$link/" + mkdir -p $REPODIR + get_descr $name > ${REPODIR}/DESCR + ## Get READMEs + get_readmes "$baselink/$SUBPATH" "$REPODIR" done +cleanup -- cgit v1.2.3