### -*-awk-*- ### ==================================================================== ### @Awk-file{ ### author = "Nelson H. F. Beebe", ### version = "0.12", ### date = "19 November 2012", ### time = "17:53:59 MST", ### filename = "indextohtml.awk", ### address = "University of Utah ### Department of Mathematics, 110 LCB ### 155 S 1400 E RM 233 ### Salt Lake City, UT 84112-0090 ### USA", ### telephone = "+1 801 581 5254", ### FAX = "+1 801 581 4148", ### checksum = "45496 531 1913 15703", ### email = "beebe@math.utah.edu, beebe@acm.org, ### beebe@computer.org (Internet)", ### codetable = "ISO/ASCII", ### keywords = "HTML, index, prettyprinter, SGML, WWW, ### World-Wide Web", ### license = "public domain", ### supported = "yes", ### docstring = "Convert an FTP tree index file to HTML, ### for the convenience of World-Wide Web users ### who access the FTP tree with a WWW client ### program, such as arena, hotjava, netscape, or ### xmosaic. These index files provide a simple ### way to add descriptive text to file names. ### ### Usage: ### nawk -f indextohtml.awk `pwd`/index >index.html ### or ### nawk -f indextohtml.awk `pwd`/index | \ ### nawk -f html-pretty.awk >index.html ### ### GNU gawk may be used in place of nawk. ### ### An optional ### -v FILE=name ### may be provided on the nawk command lines to ### set the input file name, to override the ### actual name, or in the event that input is ### from standard input. ### ### An optional ### -v HEADER=n ### may be provided on the command line to ### request insertion of HTML headers

### letter at the start of each group of ### description lines that begin with a new ### initial letter. This can help to make a ### large index more readable. ### ### FTP index files are simple text files of the ### form ### ### % Comment ### % Another comment ### ### Introductory text with no embedded tabs ### or leading percents. ### ### filename<TAB>description ### filename ### filename<TAB>description ### more description ### ### Lines that begin with one or more percents ### are formatted for verbatim display, with the ### leading percent(s), and exactly one following ### space (if present), removed. ### ### Each filename is embedded in an HREF ### hypertext link, making it easy for a human ### reader to view that file. Each time the ### basename of the filename changes, a NAME ### hypertext link is generated for the basename, ### allowing direct reference to that section of ### the file using a #basename suffix on the URL ### that points to the output file; this is ### convenient for use in other HTML files. ### ### Embedded tab characters are the key to ### differentiating the introductory text from ### the item descriptions. Once a line ### containing a tab is met, the remainder of the ### file is treated as the filename description ### section, with the exception of verbatim ### listings. ### ### There is also special support for index files ### of another format; they begin with the line ### ### # Alphabetical file listing ### ### and are simply converted to a verbatim ### listing. These are present in some FTP ### directories where the maintainer has not yet ### had time to create a customized index file. ### ### A modest attempt is made to prettyprint the ### output, but for best results, filter the ### output with html-pretty.awk. ### ### Although this program will read standard ### input if no filename is provided, it is ### better to use full path names on the input ### file, since the file directory and name are ### used in a comment header. 
### ### The checksum field above contains a CRC-16 ### checksum as the first value, followed by the ### equivalent of the standard UNIX wc (word ### count) utility output of lines, words, and ### characters. This is produced by Robert ### Solovay's checksum utility.", ### } ### ==================================================================== BEGIN { VERSION = "0.12" # these MUST match version DATE = "[19-Nov-2012]" # and date above initialize() html_banner() html_header() } /^# Alphabetical file listing/ { verbatim_file(); next } /^%/ { verbatim_line(); next } /\t/ { description_line(); next } { introduction_line() ; next } END { html_trailer() } function basename(pathname) { # Remove any compression (.gz, .Z, .z) suffix from a file # pathname, then return the basename, which is the remainder after # removal of any dotted suffix. For example: # foo -> foo # foo. -> foo # foo.tar -> foo # foo.tar.Z -> foo # foo.tar.gz -> foo # foo.tar.gz -> foo # foo.tar.z -> foo # foo-1.23.1.tar -> foo-1.23.1 # First remove any compression suffix: if (match(pathname,/[.][zZ]$/) || match(pathname,/[.]gz$/)) pathname = substr(pathname,1,RSTART-1) if (match(pathname,/[.][^.]*$/)) # then reduce foo.xyz to foo return (substr(pathname,1,RSTART-1)) else # no suffix: foo -> foo return pathname } function begin_description_item( initial_letter,k,name,suffix) { initial_letter = substr(basename($1),1,1) if (HEADER > 0) { if (HEADER > 6) HEADER = 6 if (tolower(initial_letter) != tolower(substr(last_basename,1,1))) { if (in_description_list) end_description_list() print prefix(2) "

" print prefix(2) "" if (!(initial_letter in Packages)) { print prefix(3) "" print prefix(4) initial_letter print prefix(3) "" Packages[initial_letter] = initial_letter } print prefix(2) "" begin_description_list() } } print prefix(3) "

" name = basename($1) if ((name != last_basename) && (!(name in Packages))) { last_basename = name Packages[last_basename] = $1 print prefix(4) "" } else print prefix(4) "" print prefix(5) $1 print prefix(4) "" name = package_name(last_basename) if ((name != "") && !(name in Packages)) { # Output a NAME anchor for this new package. Since nested # anchors are illegal, we need a separate anchor, but we have # no real contents for it. Some browsers fail to find the # name if its contents are empty, or spaces, or a comment, but # it appears that a non-breakable space does the trick. Since # not all browsers recognize , we have to use the # equivalent numeric entity. print prefix(4) "" print prefix(5) " " print prefix(4) "" Packages[name] = $1 } print prefix(3) "

" print prefix(3) "

" if (NF > 1) printf("%s", prefix(4)) for (k = 2; k <= NF; ++k) printf("%s%s", $k, (k == NF) ? "\n" : " ") in_description_item = 1 } function begin_description_list() { if (HEADER == 0) print prefix(2) "

" print prefix(2) "

" in_description_list = 1 } function begin_verbatim() { print prefix(2) "

"
    in_verbatim = 1
}

function description_line()
{
    # Process the current input record ($0) as part of the
    # filename-description section.
    #
    # Any open verbatim block is closed first, and a description list
    # is opened if one is not already in progress.  The record is then
    # passed through make_escape_sequences(), which rewrites $0 in
    # place, before it is classified:
    #
    #   * a record whose first character is a letter, a digit, or a
    #     hyphen starts a new item: the current item (if any) is
    #     closed and a new one is begun;
    #
    #   * any other record is continuation text for the current item:
    #     leading blanks and tabs are removed and whatever remains is
    #     printed at indentation level 5.  Records that are empty
    #     after trimming are silently dropped.

    if (in_verbatim)
        end_verbatim()

    if (!in_description_list)
        begin_description_list()

    make_escape_sequences()

    if ($0 ~ /^[-a-zA-Z0-9]/)
    {
        # New filename entry: finish the previous one, start the next.
        end_description_item()
        begin_description_item()
        return
    }

    # Continuation text: the pattern is anchored, so at most one
    # substitution can ever happen.
    sub(/^[ \t]+/, "")

    if (length($0) > 0)
        print prefix(5) $0
}

function end_description_item()
{
    if (in_description_item)
    {
	print prefix(3) "

" in_description_item = 0 } } function end_description_list() { end_description_item() print prefix(2) "" in_description_list = 0 } function end_verbatim() { print prefix(2) "" in_verbatim = 0 } function html_banner() { print "" # for GNU Emacs mode selection if (FILE != "") print "" print "" print "" print "" print "" if (FILE != "") print "" print "" } function html_header() { directory = FILE gsub(/\/[^\/]+$/,"",directory) # trim filename, if any, to get directory if (directory == "") directory = "/" # but ensure that we at least a root directory # Drop common leading path names to account for ftpd's chroot sub(/^\/var\/spool\/ftp\//, "/", directory) sub(/^\/usr\/spool\/ftp\//, "/", directory) sub(/^\/u\/ftp\//, "/", directory) print prefix(0) "\n" print prefix(0) "" print prefix(1) "" print prefix(2) "" print prefix(2) "" print prefix(3) "Index of files in " directory print prefix(2) "" print prefix(2) "" print prefix(1) "" print prefix(1) "" print prefix(2) "

" print prefix(3) "Index of files in " directory print prefix(2) "

" } function html_trailer() { if (in_description_list) end_description_list() if (in_verbatim) end_verbatim() print prefix(2) "

" print prefix(1) "" print prefix(0) "" } function initialize( cmd) { if (FILE == "") FILE = FILENAME if (FILE == "-") FILE = "" if (INDENT == "") INDENT = 4 in_description_list = 0 in_paragraph = 0 in_verbatim = 0 cmd = "awk '/^ *domain */ {print $2}' /etc/resolv.conf" cmd | getline the_host close(cmd) if (the_host == "") { cmd = "uname -n" cmd | getline the_host close(cmd) } if (the_host == "") { cmd = "hostname" cmd | getline the_host close(cmd) } "date" | getline the_time "echo $LOGNAME" | getline the_user "cat /etc/passwd | awk -F: '/^" the_user ":/ {print $5}' | head -1" | \ getline the_person if (the_person == "") "ypcat passwd | awk -F: '/^" the_user ":/ {print $5}' | head -1" | \ getline the_person } function introduction_line() { if (in_verbatim) end_verbatim() if (in_description_list) # handle case of bare filename with no description_line() # following description else { make_escape_sequences() if (in_paragraph) { if ($0 ~ /^[ \t]*$/) { print prefix(2) "

" print "" in_paragraph = 0 } else print prefix(3) $0 } else { print prefix(2) "

" if ($0 ~ /^[ \t]*$/) print "" else print prefix(3) $0 in_paragraph = 1 } } } function make_anchors() { $0 = make_email_helper(make_anchors_helper($0)) } function make_anchors_helper(s) { # See Internet RCS 1630 (June 1994) for details of # URL syntax. if (match(s,/[A-Za-z]+:\/\/[^{}\[\]\\^~<>`\" ]+/)) s = substr(s,1,RSTART-1) \ "" \ substr(s,RSTART,RLENGTH) \ "" \ make_anchors_helper(substr(s,RSTART+RLENGTH)) return (s) } function make_email_helper(s) { if (match(s,/[A-Za-z][A-Za-z0-9-]+@[A-Za-z][A-Za-z0-9-]*([.][A-Za-z][A-Za-z0-9-]*)+/)) s = substr(s,1,RSTART-1) \ "" \ substr(s,RSTART,RLENGTH) \ "" \ make_email_helper(substr(s,RSTART+RLENGTH)) return (s) } function make_escape_sequences() { gsub(/&/,"\\&") gsub(//,"\\>") gsub(/"/,"\\"") } function package_name(name) { # Convert distribution file basenames like foo-1.23.1 to the package # name foo, by stripping suffixes of hyphen or underscore followed by # digits, hyphens, underscores, and dots. sub(/[-_][-_0-9.]+$/,"",name) return (name) } function prefix(level) { # Return a prefix of up to 60 blanks return (substr(" ", 1, INDENT * level)) } function verbatim_file( hostname) { hostname = "" begin_verbatim() print $0 while (getline > 0) { make_escape_sequences() make_anchors() # If we can match a line like this: # # Host: ftp.math.utah.edu # we can convert it to a hypertext link if ((NF == 3) && ($0 ~ /\# Host:/)) { if (match($0, $3 "$") > 0) { hostname = $3 print substr($0,1,RSTART-1) "" $3 "" } else print $0 } # If we can match a line like this: # # Directory: /pub/tex/pub/tgrind # we can convert it to a hypertext link else if ((NF == 3) && ($0 ~ /\# Directory:/)) { if ((match($0, $3 "$") > 0) && (hostname != "")) print substr($0,1,RSTART-1) "" $3 "" else print $0 } # A typical file line looks like this: # -rw-r--r-- 0 Nov 11 19:17 00tdir.cmd # -rw-rw-r--+ 1133099 Nov 16 10:40 lanczos-cornelius.html # If we match this pattern, we supply an anchor around the # filename, so that it can be easily selected in a 
WWW client. else if ((NF == 6) && \ ($1 ~ /^[-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][+]?$/)) { if (match($0, $6 "$") > 0) print substr($0,1,RSTART-1) "" $6 "" else print $0 } else print $0 } end_verbatim() } function verbatim_line() { if (in_description_list) end_description_list() if (!in_verbatim) begin_verbatim() gsub(/^%+/,"") gsub(/^ /,"") make_escape_sequences() make_anchors() print $0 }