#!/usr/bin/awk -f
################################################################################
#                                                                              #
#   pyllyukko maimed [dot] org -- 2005 (i think...)                            #
#                                                                              #
################################################################################
#
# A tiny single-site web crawler.
#
# Requires GNU awk: it relies on gensub(), asorti(), systime(), IGNORECASE
# and the |& operator on /inet/tcp special files.
#
# Globals shared between the functions and the BEGIN block:
#   queue[0 .. list_items-1]  URLs to crawl, in discovery order
#   list_items                number of entries in queue
#   all_links[link]           how many times each link was seen
#   http_codes[code]          how many responses carried each status code
#   verbose_output            file that receives the per-URL trace

# Return the lower-cased host of a "scheme://host..." URL (any ":port"
# suffix removed) or of a "mailto:user@host" address.  Returns "" when no
# host can be identified.
function url_host(u,    h) {
    if (u ~ /^mailto:[^@]*@[^?]+/) {
        h = gensub(/^mailto:[^@]*@([^?]+).*$/, "\\1", 1, u)
        return tolower(h)
    }
    if (u !~ /^[a-zA-Z]+:\/\/[^/]+/)
        return ""
    h = gensub(/^[a-zA-Z]+:\/\/([^/]+).*$/, "\\1", 1, u)
    sub(/:[0-9]+$/, "", h)
    return tolower(h)
}

# Reduce a host name to its last two labels ("www.example.com" ->
# "example.com").  Hosts with fewer than three labels are returned as is.
function domain_of(h) {
    return gensub(/^.*\.([^.]+\.[^.]+)$/, "\\1", 1, h)
}

# True when host h is domain d itself or a sub-domain of it.  The
# comparison is literal, so the dots in d are never treated as regex
# wildcards and d cannot match in the middle of an unrelated name.
function host_in_domain(h, d,    tail) {
    h = tolower(h)
    d = tolower(d)
    if (h == "" || d == "")
        return 0
    if (h == d)
        return 1
    tail = "." d
    return length(h) > length(tail) && \
           substr(h, length(h) - length(tail) + 1) == tail
}

# Append url to the crawl queue unless it is already queued.
# Returns 0 when the url was added, 1 when it was a duplicate.
function add_link(url,    i) {
    for (i = 0; i < list_items; i++)
        if (queue[i] == url)
            return 1
    queue[list_items++] = url
    return 0
}

# Fetch url with a plain HTTP/1.0 GET, record the status code in
# http_codes, follow 301/302 redirects by queueing their target, and scan
# the body for href="..." attributes.  Every link found is counted in
# all_links; links that stay inside the url's domain and look like pages
# (end in "html" or "/", or carry a query string) are queued via add_link().
#
# Returns 0 after a response was processed, 1 when url is not absolute or
# no response header could be read.
#
# TODO: the request always goes out as plain HTTP, also for https:// urls.
function get_hrefs(url,    host, hostname, port, domain, request_uri, path,
                   dir, service, http_header, http_response, bytes_received,
                   cookie, location, link, fixed, link_host, target, line,
                   parts) {
    # NOTE(review): the original printed a progress line to verbose_output
    # here; its text is not recoverable from the damaged source.
    if (url !~ /^[a-zA-Z]+:\/\/[^/]+\/.*$/) {
        printf " url is not absolute!\n" > "/dev/stderr"
        return 1
    }

    host        = gensub(/^[a-zA-Z]+:\/\/([^/]+).*$/, "\\1", 1, url)
    request_uri = gensub(/^[a-zA-Z]+:\/\/[^/]+(.*)$/, "\\1", 1, url)
    # A fragment is for the client only; never send it to the server.
    sub(/#.*$/, "", request_uri)

    # Honour an explicit ":port" instead of always connecting to port 80.
    hostname = host
    port = 80
    if (host ~ /:[0-9]+$/) {
        port     = gensub(/^.*:([0-9]+)$/, "\\1", 1, host)
        hostname = gensub(/:[0-9]+$/, "", 1, host)
    }
    domain = domain_of(hostname)

    # Base directory for relative links.  It is taken from the path only,
    # so a "/" inside the query string cannot corrupt it.
    path = request_uri
    sub(/\?.*$/, "", path)
    if (path ~ /\/$/)
        dir = path
    else
        dir = gensub(/^(.*\/)[^/]+$/, "\\1", 1, path)

    # TODO: SCHEME VARIABLE
    printf " url:\t\t\t%s\n", url > verbose_output
    printf " host:\t\t\t%s\n", host > verbose_output
    printf " uri:\t\t\t%s\n", request_uri > verbose_output
    printf " dir:\t\t\t%s\n", dir > verbose_output
    printf " domain:\t\t%s\n", domain > verbose_output

    service = "/inet/tcp/0/" hostname "/" port
    # Ask gawk not to abort the whole crawl when the connection fails; on
    # gawk versions without this feature the assignment has no effect.
    PROCINFO[service, "NONFATAL"] = 1
    printf "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", request_uri, host |& service

    # The header block ends at the first empty line, so read it as a
    # single record, then go back to line-by-line reading for the body.
    RS = "\r\n\r\n"
    if ((service |& getline http_header) <= 0) {
        RS = "\n"
        printf " no response from %s\n", host > "/dev/stderr"
        close(service)
        return 1
    }
    RS = "\n"
    bytes_received = length(http_header) + 4

    if (http_header ~ /^HTTP\/[0-9]+\.[0-9]+[ \t]+[0-9]+/)
        http_response = gensub(/^HTTP\/[0-9]+\.[0-9]+[ \t]+([0-9]+).*$/, \
                               "\\1", 1, http_header) + 0
    else
        http_response = 0       # status line could not be parsed
    http_codes[http_response]++
    printf " http response:\t%d\n", http_response > verbose_output

    if (http_header ~ /Set-Cookie/) {
        cookie = gensub(/^.*Set-Cookie: ([^;]+);.*$/, "\\1", 1, http_header)
        printf " cookie:\t\t%s\n", cookie > verbose_output
    }

    if ((http_response == 301 || http_response == 302) &&
        http_header ~ /Location: [^\r\n]+/) {
        location = gensub(/^.*Location: ([^\r\n]+).*$/, "\\1", 1, http_header)
        if (location ~ /^https?:\/\/[^/]+\/.*$/)
            link = location
        else if (location ~ /^https?:\/\/[^/]+$/)
            link = location "/"
        else if (location ~ /^\//)
            link = "http://" host location
        else
            link = "http://" host dir location
        printf " -> %s\n", link > verbose_output
        all_links[link]++
        add_link(link)
    }

    # ABSOLUTE & RELATIVE URLS, FRAGMENT IDENTIFIER
    while ((service |& getline line) > 0) {
        bytes_received += length(line) + 1

        # TODO: CHECK ALSO IMG's?
        while (match(line, /href="[^"]+/)) {
            # Take the attribute value (skipping the 6 characters of
            # href=") and advance past it right away, so nothing below
            # depends on RSTART/RLENGTH staying intact.
            link = substr(line, RSTART + 6, RLENGTH - 6)
            line = substr(line, RSTART + RLENGTH)

            if (link ~ /^https?:\/\/[^/]+\/.*$/) {
                # ABSOLUTE
                link_host = url_host(link)
                if (host_in_domain(link_host, domain)) {
                    if (link_host == tolower(hostname))
                        printf " absolute (host) INTERNAL: %s\n", link > verbose_output
                    else
                        printf " absolute (domain) INTERNAL: %s\n", link > verbose_output
                } else {
                    printf " absolute EXTERNAL: %s\n", link > verbose_output
                }
            } else if (link ~ /^https?:\/\/[^/]+$/) {
                # BROKEN ABSOLUTE
                fixed = link "/"
                printf " invalid absolute: %s\n -> %s\n", link, fixed > verbose_output
                link = fixed
            } else if (link ~ /^mailto:/) {
                # MAILTO
                printf " e-mail: %s\n", substr(link, 8) > verbose_output
            } else if (link ~ /^[a-z]+:/) {
                # ALL UNRECOGNIZED SCHEMES
                printf " unrecognized scheme: %s!\n", gensub(/^([^:]+).*$/, "\\1", 1, link) > verbose_output
            } else {
                # RELATIVE
                # TODO: DISTINCT SCHEME!
                printf " relative: %s\n", link > verbose_output
                if (link ~ /^\//) {
                    link = "http://" host link
                } else if (link ~ /^#/) {
                    # A bare fragment refers to the current document.
                    link = "http://" host request_uri link
                } else {
                    if (link ~ /^\.\//)
                        sub(/^\.\//, "", link)
                    else if (link ~ /\.\.\//)       # ^?
                        printf " directory traversal, how convenient!\n" > verbose_output
                    link = "http://" host dir link
                }
                printf " -> %s\n", link > verbose_output
            }

            all_links[link]++

            if (link ~ /\?/) {
                split(link, parts, "?")
                sub(/^[A-Za-z]+:\/\/[^/]+/, "", parts[1])
                printf " resource path: %s\n", parts[1] > verbose_output
                printf " query string: ?%s\n", parts[2] > verbose_output
            }
            if (link ~ /#/) {
                split(link, parts, "#")
                printf " fragment identifier: %s\n", parts[2] > verbose_output
            }

            # Queue the document itself: the fragment is dropped so that
            # "page.html#a" and "page.html#b" are fetched only once.
            # TODO: .PL?
            target = link
            sub(/#.*$/, "", target)
            if (target ~ /^http/ &&
                host_in_domain(url_host(target), domain) &&
                (target ~ /html$/ || target ~ /\/$/ || target ~ /\?/)) {
                add_link(target)
                # TODO: BREAK DOWN DIRECTORIES
            }
        }
    }

    close(service)
    printf " %d bytes received\n", bytes_received
    return 0
} # get_hrefs()

# Crawl the site starting at the URL given as the only argument, then
# print the status-code tally, the links found inside the start URL's
# domain, and a rough throughput figure.
BEGIN {
    seconds["start"] = systime()
    verbose_output = "/dev/stdout"
    #verbose_output = "/dev/null"
    IGNORECASE = 1

    printf "itsy bitsy spider /\\oo/\\\n ~~\n"

    if (ARGC != 2) {
        printf "usage: [insert program name here] <url>\n" > "/dev/stderr"
        exit 1
    }

    # Remember the start URL's domain for the final report; a redirect
    # to another site must not change what counts as "ours".
    seed_domain = domain_of(url_host(ARGV[1]))

    queue[k = 0] = ARGV[1]
    list_items = 1

    # LOOP-DE-LOOP: get_hrefs() appends to queue while we walk it.
    while (k in queue) {
        get_hrefs(queue[k])
        k++
    }

    printf "http responses received:\n code\tcount\n"
    for (i in http_codes)
        printf " %d\t%d\n", i, http_codes[i]

    # asorti() replaces all_links with its own indices in sorted order.
    n = asorti(all_links)
    printf "found the following urls:\n"
    for (i = 1; i <= n; i++) {
        if (host_in_domain(url_host(all_links[i]), seed_domain))
            printf " %s: %s\n", i, all_links[i]
    }

    seconds["stop"] = systime()
    seconds["total"] = seconds["stop"] - seconds["start"]
    if (seconds["total"] == 0)
        printf "crawled through %d uri's in ~%d seconds!\n", list_items, seconds["total"]
    else
        printf "crawled through %d uri's in ~%d second(s),\nthat's like %.2f in a second or something.\n", list_items, seconds["total"], list_items / seconds["total"]

    exit 0
}