summary refs log tree commit diff
diff options
context:
space:
mode:
author KatolaZ <katolaz@freaknet.org> 2020-01-09 08:09:15 +0000
committer KatolaZ <katolaz@freaknet.org> 2020-01-09 08:09:15 +0000
commit 5bd5eae1db07ada543c5ecacb6069b539958bd33 (patch)
tree 1c17940a9a394feae6fbaea29e5436c743758e72
parent c9dd2610b1d27ed632fdf912e1ba1e093ec373e0 (diff)
refactor parse_cgit
-rwxr-xr-x parse_cgit 75
1 files changed, 56 insertions, 19 deletions
diff --git a/parse_cgit b/parse_cgit
index 649a692..d6951af 100755
--- a/parse_cgit
+++ b/parse_cgit
@@ -12,48 +12,85 @@ fi
FIN="/dev/stdin"
URLBASE="$1"
DEST="$2"
-
SUBPATH="/plain"
-
CURL="torify curl -Ls "
-
-PROTO=${URLBASE%%:\/\/*}
-DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}"
-echo "proto: $PROTO"
-echo "dirbase: $DIRBASE"
-
+TMPFILE=$(mktemp ./tmp_XXXXXX) || exit 1 ## unpredictable name in the current directory
READMES="README README.txt README.md readme readme.txt readme.md"
+## cleanup: remove the temporary file and exit (installed as the trap handler)
+cleanup () {
+ rm -f -- "$TMPFILE"
+ exit
+}
+
+## get_repos: fetch $URLBASE and set the global $repos to "field3|field4" (href= stripped) per matching <a title=...> row
+get_repos() {
repos=$(${CURL} "$URLBASE" | xml2tsv | grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+title=" \
| awk -F'\t' '{print $3,$4}' \
| sed -E 's/href=//g;s/ /\|/'\
)
+}
+## get_descr REPONAME URL: print the repo name and URL, then fields 3.. of the last two "td class=sub" rows fetched from URL
+get_descr() {
+ reponame=$1
+ url=$2
+ echo "reponame: $reponame"
+ echo "url: $url"
+ ## Get author and description (presumably the last two class=sub cells of the page -- TODO confirm)
-for r in $repos; do
- printf "Retrieving repo %s...\n" $repo
- link=$(echo "$r" | cut -d "|" -f 1 )
- name=$(echo "$r" | cut -d "|" -f 2 )
- baselink=$(printf "%s%s" $URLBASE $link)
- REPODIR="$DEST/$DIRBASE/$link/"
- mkdir -p $REPODIR
+ $CURL "$url" | xml2tsv | grep -Ei "/html/body/div/table/tr/td[[:blank:]]+class=sub" | \
+## cat "$TMPFILE" | grep -Ei "/html/body/div/div/table/tr/td/a[[:blank:]]+href=$link" |\
+ tail -2 | cut -f 3-
+
+}
+
+## get_readmes LINK DESTDIR: download each name in $READMES from LINK into DESTDIR, deleting any whose body is the "Not found" error page
+get_readmes() {
+ LINK=$1
+ DESTDIR=$2
for f in $READMES; do
- printf " trying file %s..." $baselink/$SUBPATH/$f
- torify curl -Ls "$baselink/$SUBPATH/$f" > $REPODIR/$f
- failure=$(xml2tsv < $REPODIR/$f 2>/dev/null | \
+ printf " trying file %s..." "$LINK/$f"
+ $CURL "$LINK/$f" > "$DESTDIR/$f"
+ failure=$(xml2tsv < "$DESTDIR/$f" 2>/dev/null | \
grep -Eaic "/html/body/div/div/div[[:blank:]]+class=error[[:blank:]]+Not found")
echo $failure
if [ "$failure" != 0 ]; then
- rm -f $REPODIR/$f
+ rm -f "$DESTDIR/$f"
printf "[FAILED]\n"
else
printf "[OK]\n"
fi
sleep 1
done
+}
+
+PROTO=${URLBASE%%:\/\/*}
+DIRBASE="$PROTO/${URLBASE##[a-z]*:\/\/}"
+echo "proto: $PROTO"
+echo "dirbase: $DIRBASE"
+## Run cleanup on exit and on TERM/INT (SIGKILL cannot be trapped, so it is not listed)
+trap cleanup EXIT TERM INT
+## Save the parsed index page to $TMPFILE
+$CURL "${URLBASE}" | xml2tsv > "$TMPFILE"
+
+get_repos
+
+for r in $repos; do
+ link=$(echo "$r" | cut -d "|" -f 1 )
+ name=$(echo "$r" | cut -d "|" -f 2 )
+ printf "Retrieving repo %s...\n" "$name"
+ baselink=$(printf "%s%s" "$URLBASE" "$link")
+ REPODIR="$DEST/$DIRBASE/$link/"
+ mkdir -p "$REPODIR"
+ get_descr "$link" "$baselink" > "${REPODIR}/DESCR"
+ ## Get READMEs
+ get_readmes "$baselink/$SUBPATH" "$REPODIR"
done
+cleanup
+
### the readme file is at REPLINK/plain/README
### if not found, look for "/html/body/div/div/div class=error Not found"