summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md29
-rwxr-xr-xburrow23
-rwxr-xr-xurl_to_id57
3 files changed, 104 insertions, 5 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..be73adc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+## Burrow-The-Burrows
+
+A Gopher burrower in a shell script. By using `burrow` and a bit of
+plumbing you can get all the links in a Gopher MENU, recursively visit
+all the available subdirs, and create a directed graph of the visited
+selectors.
+
+`burrow` takes as input a gopher identifier, as generated by
+`url_to_id`, treats the document it refers to as a gophermap, and prints
+on stdout the list of menu selectors found in it. `burrow` also dumps on
+stderr the list of all the edges (to any kind of selector) found in
+that page, in the format:
+
+ src_SHA256 dst_SHA256
+
+where `src_SHA256` is the SHA256 of the source selector (the current
+document), while `dst_SHA256` is the SHA256 of the destination selector
+(the pointed-to document).
+
+To start a crawl, one can do something like:
+
+```
+ $ ./url_to_id gopher://your.gopher.url/ > ids
+ $ tail -f ids | parallel -j2 './burrow {}' 2>> graph.txt | tee -a ids >/dev/null &
+```
+
+Notice that `burrow` will create a certain number of folders in the
+current directory, used to keep track of the selectors that have been
+already retrieved.
diff --git a/burrow b/burrow
index 2665d21..8138ca5 100755
--- a/burrow
+++ b/burrow
@@ -7,6 +7,19 @@
###
### where SHA256 is the SHA256SUM of "1|SELECTOR|HOST|PORT"
###
+### *** DRY RUN ***
+###
+### If run as burrow?* (i.e., "burrow" followed by at least one
+### character), burrow will run in DRY MODE, i.e., it will just check
+### if the id provided as input exists, and then exit.
+###
+###------------------------------------------------
+###
+### (C) Vincenzo 'KatolaZ' Nicosia <katolaz@freaknet.org>
+###
+### Use, modify, redistribute under the terms of the GNU General
+### Public License version 3 or, at your option, any other version.
+###
## function
get_dirs(){
src_id="$1"
@@ -56,7 +69,7 @@ retrieve_selector(){
check_selector_present(){
sel_id="$1"
sel_dir="$(echo ${sel_id} | cut -c -2)"
- [ -d "${sel_dir}" -a -f "${sel_dir}/${sel_id}" ] && exit
+ [ -d "${sel_dir}" -a -f "${sel_dir}/${sel_id}" ] && echo "${SRC}" >>present && exit
## {
## if at least one of the neighbours of sel_id is missing, cat the entire list of
## neighbours to be re-visited and exit
@@ -69,6 +82,8 @@ check_selector_present(){
[ ! -d "${sel_dir}" ] && mkdir -p "${sel_dir}"
}
+[ $# -lt 1 ] && echo "Usage: $0 <gopherlink>" && exit 1
+
SRC="$1"
@@ -78,5 +93,9 @@ check_selector_present "${src_id}"
echo "selector ${src_id} not found" >> logfile.txt
-retrieve_selector "$SRC" | sed -r -e 's/\t/|/g' | get_dirs "${src_id}"
+MYNAME="${0##*/}"
+## anything but the exact name "burrow" means dry run: skip the retrieval
+if [ -z "${MYNAME##burrow}" ]; then
+ retrieve_selector "$SRC" | sed -r -e 's/\t/|/g' | get_dirs "${src_id}"
+fi
diff --git a/url_to_id b/url_to_id
index 69cc0fc..4d96442 100755
--- a/url_to_id
+++ b/url_to_id
@@ -1,15 +1,25 @@
#!/bin/sh
-## get a selector in gph format:
+##
+## Get a gopherlink in the format:
+##
+## gopher://domain.org:port/*/my/cool/selector
+##
+## or a selector in gph format:
##
## [TYPE|SEL|HOST|PORT]
##
-## and print on output the corresponding selectorid:
+## and print on output the corresponding "unique" selectorid:
##
## TYPE|SEL|HOST|PORT|SHA256
##
## which is understood by `burrow`
+
+###
+### get a selector in gph format and transform it into a selectorid
+###
+## function
gph_to_id(){
gph="$( echo $1| sed 's/\[//g;s/\]//g')"
OLDIFS=$IFS
@@ -20,4 +30,45 @@ gph_to_id(){
IFS="$OLDIFS"
}
-gph_to_id "$1"
+###
+### Turn gopher://HOST[:PORT][/TYPE[/SELECTOR]] into a selectorid via
+### gph_to_id. PORT defaults to 70; no TYPE means type 1, selector "/".
+### Exits 1 (message on stderr) if TYPE is not an accepted type char.
+## function
+gopherurl_to_id(){
+ ## strip only the leading scheme; quoted, so no globbing/word-splitting
+ URL="${1#gopher://}"
+ hostport=$(printf '%s\n' "$URL" | cut -d "/" -f 1)
+ host=$(printf '%s\n' "$hostport" | cut -d ":" -f 1)
+ port=$(printf '%s\n' "$hostport" | cut -s -d ":" -f 2)
+ [ -z "$port" ] && port='70'
+ type=$(printf '%s\n' "$URL" | cut -s -d "/" -f 2)
+ [ -z "$type" ] && {
+  gph_to_id "[1|/|${host}|${port}]"
+  exit 0
+ }
+ ## TYPE must be exactly one of the accepted item-type characters
+ case "$type" in
+  [0-9ITghis+]) ;;
+  *) echo "Invalid Gopher URL" >&2; exit 1 ;;
+ esac
+ sel=/$(printf '%s\n' "$URL" | cut -s -d "/" -f 3-)
+ gph_to_id "[${type}|${sel}|${host}|${port}]"
+}
+
+
+
+## Entry point: dispatch on the shape of the first argument.
+[ $# -lt 1 ] && {
+ echo "Usage: $0 <gopherurl>" >&2
+ echo " $0 <gphselector>" >&2
+ exit 1
+}
+## match the quoted argument with case patterns: a "[...]" selector is
+## itself a glob, so an unquoted echo could expand it to file names
+case "$1" in
+ gopher://*) gopherurl_to_id "$1"; exit 0 ;;
+ \[*\]) gph_to_id "$1"; exit 0 ;;
+esac
+echo "No valid URL or gph selector provided" >&2
+exit 1