#!/bin/sh
#
# Repo README scraper and search tools.
#
# Reconstructed from the whitespace-collapsed patch of commit
# bc3a1f13fd5fddb9fa8f3ecff6144b512ed9d08b ("initial commit", KatolaZ,
# Wed, 8 Jan 2020 07:45:14 +0000), which added three executables:
#
#   parse_cgit   <urlbase> <destdir>           mirror READMEs of a cgit site
#   parse_stagit <urlbase> <destdir>           mirror READMEs of a stagit site
#   search_repo  <folder> <word> [<word>...]   rank mirrored repos by matches
#
# Each tool is a function with the same name and positional arguments as the
# original script.  The function bodies are subshells "( ... )", so `exit`
# and `cd` inside them behave exactly as they did in the standalone scripts
# and no variable leaks into the caller.  When this file is executed under
# one of the three names (e.g. through a symlink) the matching tool runs.
#
# External tools used by the parsers: torify, curl, xml2tsv.

# Print "<scheme>/<rest-of-url>" for the URL in $1: the relative directory
# under which everything fetched from that site is stored.
#   e.g. https://git.example.org/sub -> https/git.example.org/sub
_repo_dirbase() {
  printf '%s/%s\n' "${1%%://*}" "${1##[a-z]*://}"
}

# Return 0 if every command named in "$@" is available, otherwise report the
# first missing one on stderr and return 1.  Without this check a missing
# xml2tsv makes every "not found" test count 0 matches.
_require_tools() {
  for _tool in "$@"; do
    command -v "$_tool" >/dev/null 2>&1 || {
      printf 'error: required tool not found: %s\n' "$_tool" >&2
      return 1
    }
  done
}

#######################################
# Fetch a cgit-style repo index from a URL and try to download the README
# of every repo listed there.
# Arguments:
#   $1 - base URL of the cgit site (the index page is fetched from here)
#   $2 - destination directory; files land in $2/<scheme>/<host...>/<repo>/
# Outputs:
#   progress messages on stdout
# Returns:
#   1 on wrong usage or missing tools
#######################################
parse_cgit() (
  if [ $# -lt 2 ]; then
    printf 'Usage: %s <urlbase> <destdir>\n' parse_cgit >&2
    exit 1
  fi
  _require_tools torify curl xml2tsv || exit 1

  URLBASE="$1"
  DEST="$2"

  # The raw README of a repo is served at <repo link>/plain/<file>.
  SUBPATH="/plain"

  # Expanded unquoted on purpose so it splits into command + options.
  CURL="torify curl -Ls"

  PROTO=${URLBASE%%://*}
  DIRBASE=$(_repo_dirbase "$URLBASE")
  echo "proto: $PROTO"
  echo "dirbase: $DIRBASE"

  READMES="README README.txt README.md readme readme.txt readme.md"

  # One "link|name" record per repo anchor of the index table.  Fields 3 and
  # 4 of the tab-separated xml2tsv record are taken to be the href attribute
  # and the anchor text.
  repos=$($CURL "$URLBASE" | xml2tsv \
    | grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" \
    | awk -F'\t' '{print $3,$4}' \
    | sed -E 's/href=//g;s/ /\|/')

  # Read the records line by line: no word splitting or globbing is applied
  # to scraped data.
  while IFS= read -r r; do
    [ -n "$r" ] || continue
    link=${r%%|*}
    rest=${r#*|}
    name=${rest%%|*}
    printf "Retrieving repo %s...\n" "$name"
    baselink="${URLBASE}${link}"
    REPODIR="$DEST/$DIRBASE/$link/"
    if ! mkdir -p "$REPODIR"; then
      printf 'cannot create %s, skipping\n' "$REPODIR" >&2
      continue
    fi
    for f in $READMES; do
      printf " trying file %s..." "$baselink/$SUBPATH/$f"
      # stdin is redirected so the fetch can never eat the record list.
      $CURL "$baselink/$SUBPATH/$f" > "$REPODIR/$f" < /dev/null
      # A missing file is answered with an HTML page holding this error div.
      failure=$(xml2tsv < "$REPODIR/$f" 2>/dev/null \
        | grep -Eaic "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found")
      # An empty download (e.g. a network error) is a failure as well.
      if [ ! -s "$REPODIR/$f" ] || [ "$failure" != 0 ]; then
        rm -f "$REPODIR/$f"
        printf "[FAILED]\n"
      else
        printf "[OK]\n"
      fi
      #sleep 1
    done
  done <<EOF
$repos
EOF
)

#######################################
# Fetch a stagit-style repo index from a URL and try to download the README
# of every repo listed there.  stagit serves files as HTML pages, so the
# text is extracted from the numbered-line anchors of each page.
# Arguments:
#   $1 - base URL of the stagit site (the index page is fetched from here)
#   $2 - destination directory; files land in $2/<scheme>/<host...>/<repo>/
# Outputs:
#   progress messages on stdout, link diagnostics on stderr
# Returns:
#   1 on wrong usage or missing tools
#######################################
parse_stagit() (
  if [ $# -lt 2 ]; then
    printf 'Usage: %s <urlbase> <destdir>\n' parse_stagit >&2
    exit 1
  fi
  _require_tools torify curl xml2tsv || exit 1

  URLBASE="$1"
  DEST="$2"

  # Rendered files of a repo are served at <repo link>/file/<name>.html.
  SUBPATH="/file"

  # Expanded unquoted on purpose so it splits into command + options.
  CURL="torify curl -Ls"

  PROTO=${URLBASE%%://*}
  DIRBASE=$(_repo_dirbase "$URLBASE")
  echo "proto: $PROTO"
  echo "dirbase: $DIRBASE"

  READMES="README.html README.txt.html README.md.html readme.html readme.txt.html readme.md.html"

  # One "link|name" record per repo anchor: the last two whitespace-separated
  # fields of each matching xml2tsv record.
  repos=$($CURL "$URLBASE" | xml2tsv \
    | grep "/html/body/div/table/tbody/tr/td/a" \
    | awk '{print $(NF-1), $NF}' \
    | sed -E 's/href=//g;s/ /\|/')

  while IFS= read -r r; do
    [ -n "$r" ] || continue
    link=${r%%|*}
    # Index anchors point at <repo>/log.html; keep only the repo part.
    link="${link%%log.html}"
    baselink="$URLBASE/$link"
    printf "link: %s\nbaselink: %s\n" "$link" "$baselink" 1>&2
    REPODIR="$DEST/$DIRBASE/$link/"
    if ! mkdir -p "$REPODIR"; then
      printf 'cannot create %s, skipping\n' "$REPODIR" >&2
      continue
    fi
    for f in $READMES; do
      printf " trying file %s..." "$baselink/$SUBPATH/$f"
      page="$REPODIR/$f.tmp"
      # stdin is redirected so the fetch can never eat the record list.
      $CURL "$baselink/$SUBPATH/$f" > "$page" < /dev/null
      failure=$(xml2tsv < "$page" 2>/dev/null \
        | grep -Eaic "^/html/head/title[[:blank:]]+404 Not Found")
      # Any match means "not found"; an empty download is a failure too.
      if [ ! -s "$page" ] || [ "$failure" != 0 ]; then
        printf "[FAILED]\n"
      else
        # Keep the text of the numbered-line anchors (field 6 onwards) and
        # undo the \n, \t and \\ escapes found in the converted record.
        xml2tsv < "$page" 2>/dev/null \
          | grep -Eai "/html/body/div/pre/a[[:blank:]]+href=#.*[[:blank:]]+class=line" \
          | cut -f 6- \
          | sed -E 's/\\n//g;s/\\t/\t/g;s/\\\\/\\/g' > "$REPODIR/$f"
        printf "[OK]\n"
      fi
      rm -f "$page"
      sleep 1
    done
  done <<EOF
$repos
EOF
)

#######################################
# Search a folder of mirrored repos for words and print the URLs of the
# matching repos, best match (most matching lines in one file) first.
# Arguments:
#   $1     - folder laid out as <scheme>/<host...>/<repo>/<file>
#   $2...  - words; a file matches if it contains any of them
#            (case-insensitive, extended regex)
# Outputs:
#   one repo URL per line on stdout, each URL listed once
# Returns:
#   1 on wrong usage or if the folder cannot be entered; otherwise the
#   status of the final filter (non-zero when nothing matched)
#######################################
search_repo() (
  if [ $# -lt 2 ]; then
    printf 'Usage: %s <folder> <word> [<word>...]\n' search_repo >&2
    exit 1
  fi

  FOLDER="$1"
  shift
  WORDS="$*"

  # Every blank becomes an alternation: "foo bar" -> "foo|bar".
  query=$(printf '%s\n' "$WORDS" | tr ' ' '|')

  # Never fall through and grep the caller's current directory instead.
  cd "$FOLDER" || exit 1

  # grep -c prints "path:count".  The count is moved to the front before
  # sorting because the path itself may contain ':' (e.g. host:8080).
  # Then: <scheme>/... becomes <scheme>://..., the file name is dropped,
  # and repos hit through several files are listed once (first = best).
  grep -Eric -e "$query" . \
    | sed -E -e 's/^\.\///' -e '/:0$/d' -e 's/^(.*):([0-9]+)$/\2 \1/' \
    | sort -rn -k1,1 \
    | sed -E -e 's/^[0-9]+ //' -e 's/([a-z]+)\//\1:\/\//' -e 's/\/[^\/]*$//' \
    | awk '!seen[$0]++' \
    | grep -Ei "^[a-z]+://"
)

# Multi-call entry point: run the tool this file was invoked as.
case "${0##*/}" in
  parse_cgit | parse_stagit | search_repo) "${0##*/}" "$@" ;;
esac