### -*-awk-*-
### ====================================================================
### @Awk-file{
### author = "Nelson H. F. Beebe",
### version = "0.12",
### date = "19 November 2012",
### time = "17:53:59 MST",
### filename = "indextohtml.awk",
### address = "University of Utah
### Department of Mathematics, 110 LCB
### 155 S 1400 E RM 233
### Salt Lake City, UT 84112-0090
### USA",
### telephone = "+1 801 581 5254",
### FAX = "+1 801 581 4148",
### checksum = "45496 531 1913 15703",
### email = "beebe@math.utah.edu, beebe@acm.org,
### beebe@computer.org (Internet)",
### codetable = "ISO/ASCII",
### keywords = "HTML, index, prettyprinter, SGML, WWW,
### World-Wide Web",
### license = "public domain",
### supported = "yes",
### docstring = "Convert an FTP tree index file to HTML,
### for the convenience of World-Wide Web users
### who access the FTP tree with a WWW client
### program, such as arena, hotjava, netscape, or
### xmosaic. These index files provide a simple
### way to add descriptive text to file names.
###
### Usage:
### nawk -f indextohtml.awk `pwd`/index >index.html
### or
### nawk -f indextohtml.awk `pwd`/index | \
### nawk -f html-pretty.awk >index.html
###
### GNU gawk may be used in place of nawk.
###
### An optional
### -v FILE=name
### may be provided on the nawk command lines to
### set the input file name, to override the
### actual name, or in the event that input is
### from standard input.
###
### An optional
### -v HEADER=n
### may be provided on the command line to
### request insertion of a level-n HTML header
### (<Hn>letter</Hn>) at the start of each group of
### description lines that begin with a new
### initial letter. This can help to make a
### large index more readable.
###
### FTP index files are simple text files of the
### form
###
### % Comment
### % Another comment
###
### Introductory text with no embedded tabs
### or leading percents.
###
### filename<TAB>description
### filename
### filename<TAB>description
### <TAB>more description
###
### Lines that begin with one or more percents
### are formatted for verbatim display, with the
### leading percent(s), and exactly one following
### space (if present), removed.
###
### Each filename is embedded in an HREF
### hypertext link, making it easy for a human
### reader to view that file. Each time the
### basename of the filename changes, a NAME
### hypertext link is generated for the basename,
### allowing direct reference to that section of
### the file using a #basename suffix on the URL
### that points to the output file; this is
### convenient for use in other HTML files.
###
### Embedded tab characters are the key to
### differentiating the introductory text from
### the item descriptions. Once a line
### containing a tab is met, the remainder of the
### file is treated as the filename description
### section, with the exception of verbatim
### listings.
###
### There is also special support for index files
### of another format; they begin with the line
###
### # Alphabetical file listing
###
### and are simply converted to a verbatim
### listing. These are present in some FTP
### directories where the maintainer has not yet
### had time to create a customized index file.
###
### A modest attempt is made to prettyprint the
### output, but for best results, filter the
### output with html-pretty.awk.
###
### Although this program will read standard
### input if no filename is provided, it is
### better to use full path names on the input
### file, since the file directory and name are
### used in a comment header.
###
### The checksum field above contains a CRC-16
### checksum as the first value, followed by the
### equivalent of the standard UNIX wc (word
### count) utility output of lines, words, and
### characters. This is produced by Robert
### Solovay's checksum utility.",
### }
### ====================================================================
# Start-up: record the program identity, collect environment details,
# and emit the leading part of the output before any input is read.
BEGIN {
    VERSION = "0.12"            # these MUST match version
    DATE = "[19-Nov-2012]"      # and date above
    initialize()
    html_banner()
    html_header()
}

# Input dispatch.  The first rule that matches a record handles it and
# moves on to the next record, so the order of the rules is significant.

# An automatically generated listing: the whole rest of the file is verbatim.
/^# Alphabetical file listing/ {
    verbatim_file()
    next
}

# A line with a leading percent is shown verbatim.
/^%/ {
    verbatim_line()
    next
}

# A line containing a tab belongs to the file description section.
/\t/ {
    description_line()
    next
}

# Everything else is introductory text (or a bare file name).
{
    introduction_line()
    next
}

# Close whatever is still open and finish the document.
END {
    html_trailer()
}
function basename(pathname)
{
    # Return pathname with one compression suffix (.gz, .Z, or .z)
    # dropped, and then with its last dotted suffix dropped as well.
    # Examples:
    #     foo            -> foo
    #     foo.           -> foo
    #     foo.tar        -> foo
    #     foo.tar.Z      -> foo
    #     foo.tar.gz     -> foo
    #     foo.tar.z      -> foo
    #     foo-1.23.1.tar -> foo-1.23.1

    # Step 1: at most one compression suffix goes; .gz is only tried
    # when neither .z nor .Z was found.
    if (!sub(/[.][zZ]$/, "", pathname))
        sub(/[.]gz$/, "", pathname)

    # Step 2: cut from the last dot onward; a name without a dot is
    # left exactly as it is.
    sub(/[.][^.]*$/, "", pathname)

    return pathname
}
# Start the output for one index entry whose file name is in $1 and
# whose description, if any, is in fields 2..NF.  Optionally precedes
# the entry with an initial-letter heading (when HEADER > 0), records
# names in the global Packages array so that each is handled only once,
# and leaves in_description_item set to 1.
#
# The parameters are awk-style local variables; suffix is declared but
# is not used anywhere in the body.
#
# NOTE(review): every literal printed here is either empty or split
# across two source lines.  They presumably carried markup that was
# lost from this copy of the file -- confirm against a pristine copy
# before relying on the output.
function begin_description_item( initial_letter,k,name,suffix)
{
initial_letter = substr(basename($1),1,1)
if (HEADER > 0)
{
# HEADER is clamped to at most 6.
if (HEADER > 6)
HEADER = 6
# Compare first letters without regard to case; a change of letter
# closes any open list and starts a new heading and a new list.
if (tolower(initial_letter) != tolower(substr(last_basename,1,1)))
{
if (in_description_list)
end_description_list()
print prefix(2) ""
print prefix(2) ""
# The letter itself is printed only the first time it is seen.
if (!(initial_letter in Packages))
{
print prefix(3) ""
print prefix(4) initial_letter
print prefix(3) ""
Packages[initial_letter] = initial_letter
}
print prefix(2) ""
begin_description_list()
}
}
print prefix(3) "
"
name = basename($1)
# A basename that differs from the previous one and has not been
# recorded yet becomes the new last_basename; the two branches print
# separate (currently identical) literals.
if ((name != last_basename) && (!(name in Packages)))
{
last_basename = name
Packages[last_basename] = $1
print prefix(4) ""
}
else
print prefix(4) ""
print prefix(5) $1
print prefix(4) ""
name = package_name(last_basename)
if ((name != "") && !(name in Packages))
{
# Output a NAME anchor for this new package. Since nested
# anchors are illegal, we need a separate anchor, but we have
# no real contents for it. Some browsers fail to find the
# name if its contents are empty, or spaces, or a comment, but
# it appears that a non-breakable space does the trick. Since
# not all browsers recognize the named entity for it, we have
# to use the equivalent numeric entity.
print prefix(4) ""
print prefix(5) " "
print prefix(4) ""
Packages[name] = $1
}
print prefix(3) "
"
print prefix(3) "
"
# Print the description words separated by single blanks; nothing at
# all is printed when the line has only a file name.
if (NF > 1)
printf("%s", prefix(4))
for (k = 2; k <= NF; ++k)
printf("%s%s", $k, (k == NF) ? "\n" : " ")
in_description_item = 1
}
function begin_description_list()
{
    # Open the list that holds the file descriptions and record that
    # it is open, so that description_line() does not open it again
    # and end_description_list()/html_trailer() know to close it.
    #
    # NOTE(review): the literal printed when HEADER == 0 arrived empty
    # and is left as received; presumably it was a separator element --
    # confirm against a pristine copy.  The list tag itself had been
    # lost from the unterminated literal that followed.
    if (HEADER == 0)
        print prefix(2) ""
    print prefix(2) "<DL>"
    in_description_list = 1
}

function begin_verbatim()
{
    # Open a verbatim (preformatted) region and record that it is open.
    # verbatim_file() and verbatim_line() call this function, and
    # end_verbatim() is its counterpart; the definition was missing,
    # with only its final assignment left behind in the function above.
    print prefix(2) "<PRE>"
    in_verbatim = 1
}
function description_line()
{
    # Handle one line of the file description section.  A line that
    # starts with a letter, digit, or hyphen begins a new entry; any
    # other line is continuation text for the current entry.

    # Leave verbatim mode and make sure the list is open.
    if (in_verbatim)
        end_verbatim()
    if (!in_description_list)
        begin_description_list()

    make_escape_sequences()

    if ($0 !~ /^[-a-zA-Z0-9]/)
    {
        # Continuation text: drop leading white space, and print the
        # line only if something remains.
        sub(/^[ \t]+/, "")
        if ($0 != "")
            print prefix(5) $0
        return
    }

    # New entry: finish the previous one first.
    end_description_item()
    begin_description_item()
}
function end_description_item()
{
    # Close the entry opened by begin_description_item(), if there is
    # one, and clear the flag.  Calling this with no entry open is a
    # harmless no-op.
    #
    # The literal here was an unterminated string split over two lines
    # (a syntax error).  begin_description_item() prints three literals
    # at this indentation level (open term, close term, open
    # definition), so the one still open at this point is the
    # definition.
    if (in_description_item)
    {
        print prefix(3) "</DD>"
        in_description_item = 0
    }
}
function end_description_list()
{
    # Close the description list: finish any entry that is still open,
    # emit the closing tag that pairs with begin_description_list(),
    # and clear the flag.  The closing tag had been lost from the
    # literal, so the function printed only indentation.
    end_description_item()
    print prefix(2) "</DL>"
    in_description_list = 0
}
function end_verbatim()
{
    # Close the verbatim (preformatted) region and clear the flag.
    # The closing tag had been lost from the literal, so the function
    # printed only indentation and the region was never terminated.
    print prefix(2) "</PRE>"
    in_verbatim = 0
}
# Print the banner lines that open the output file.  Two of the lines
# are printed only when FILE is non-empty.
#
# NOTE(review): every literal here is empty, so the function currently
# prints only blank lines.  The surviving trailing comment suggests the
# first line was an Emacs mode-selection line, and the values gathered
# by initialize() (the_host, the_time, the_user, the_person) and by the
# BEGIN block (VERSION, DATE) are not referenced anywhere else in the
# visible file, so presumably they were printed here -- confirm against
# a pristine copy.
function html_banner()
{
print "" # for GNU Emacs mode selection
if (FILE != "")
print ""
print ""
print ""
print ""
print ""
if (FILE != "")
print ""
print ""
}
# Print the document head and the page heading.  The directory shown in
# both places is derived from FILE.  Note that directory is not a
# declared local, so it is a global variable.
#
# NOTE(review): most literals here are empty, and two are split across
# source lines; presumably they carried the document structure markup
# -- confirm against a pristine copy.
function html_header()
{
directory = FILE
gsub(/\/[^\/]+$/,"",directory) # trim filename, if any, to get directory
# NOTE(review): when FILE contains no slash at all, the substitution
# above does not match, so directory remains the bare file name.
if (directory == "")
directory = "/" # but ensure that we have at least a root directory
# Drop common leading path names to account for ftpd's chroot
sub(/^\/var\/spool\/ftp\//, "/", directory)
sub(/^\/usr\/spool\/ftp\//, "/", directory)
sub(/^\/u\/ftp\//, "/", directory)
print prefix(0) "\n"
print prefix(0) ""
print prefix(1) ""
print prefix(2) ""
print prefix(2) ""
# The same text is printed twice: once here and once further below.
print prefix(3) "Index of files in " directory
print prefix(2) ""
print prefix(2) ""
print prefix(1) ""
print prefix(1) ""
print prefix(2) "
"
print prefix(3) "Index of files in " directory
print prefix(2) "
"
}
function html_trailer()
{
    # Finish the document: close whichever region is still open, then
    # print the three closing lines at decreasing indentation.
    #
    # NOTE(review): the three literals below are empty in this copy of
    # the file; presumably they held closing markup -- confirm against
    # a pristine copy.
    if (in_description_list)
    {
        end_description_list()
    }
    if (in_verbatim)
    {
        end_verbatim()
    }

    print prefix(2) ""
    print prefix(1) ""
    print prefix(0) ""
}
function initialize(    cmd, k)
{
    # Set up the global state used by the rest of the program:
    #
    #   FILE        input file name ("" for standard input)
    #   INDENT      number of blanks per indentation level (default 4)
    #   in_*        state flags, all initially 0
    #   the_host    domain or host name of this machine
    #   the_time    output of the date command
    #   the_user    value of $LOGNAME
    #   the_person  full-name field for the_user from the password data
    #
    # cmd and k are local variables.

    if (FILE == "")
        FILE = FILENAME

    # This function runs from the BEGIN rule, where POSIX leaves
    # FILENAME undefined and gawk sets it to "".  Fall back to the
    # first command-line operand that is neither empty nor a var=value
    # assignment.
    if (FILE == "")
    {
        for (k = 1; k < ARGC; ++k)
        {
            if ((ARGV[k] != "") && (ARGV[k] !~ /^[A-Za-z_][A-Za-z0-9_]*=/))
            {
                FILE = ARGV[k]
                break
            }
        }
    }

    if (FILE == "-")            # "-" means standard input
        FILE = ""

    if (INDENT == "")
        INDENT = 4

    in_description_item = 0
    in_description_list = 0
    in_paragraph = 0
    in_verbatim = 0

    # Try three sources for the host name, stopping at the first one
    # that yields a non-empty answer.
    cmd = "awk '/^ *domain */ {print $2}' /etc/resolv.conf"
    cmd | getline the_host
    close(cmd)

    if (the_host == "")
    {
        cmd = "uname -n"
        cmd | getline the_host
        close(cmd)
    }

    if (the_host == "")
    {
        cmd = "hostname"
        cmd | getline the_host
        close(cmd)
    }

    # Each remaining command is stored in cmd before being piped to
    # getline: an unparenthesized concatenation to the left of
    # "| getline" is parsed differently by different awk
    # implementations.  Each pipe is closed once its single line has
    # been read.
    cmd = "date"
    cmd | getline the_time
    close(cmd)

    cmd = "echo $LOGNAME"
    cmd | getline the_user
    close(cmd)

    cmd = "cat /etc/passwd | awk -F: '/^" the_user ":/ {print $5}' | head -1"
    cmd | getline the_person
    close(cmd)

    if (the_person == "")       # not in the local file: try NIS
    {
        cmd = "ypcat passwd | awk -F: '/^" the_user ":/ {print $5}' | head -1"
        cmd | getline the_person
        close(cmd)
    }
}
# Handle a line that has no leading percent and no tab.  Before the
# description list has been opened, such a line is introductory text,
# grouped into paragraphs that are separated by blank lines.  Once the
# list is open, the line is handed to description_line() instead.
#
# NOTE(review): the paragraph literals below are empty or split across
# two source lines; presumably they carried paragraph markup -- confirm
# against a pristine copy.
function introduction_line()
{
if (in_verbatim)
end_verbatim()
if (in_description_list) # handle case of bare filename with no
description_line() # following description
else
{
make_escape_sequences()
if (in_paragraph)
{
# Inside a paragraph, a blank or all-white-space line ends it;
# any other line is paragraph text.
if ($0 ~ /^[ \t]*$/)
{
print prefix(2) ""
print ""
in_paragraph = 0
}
else
print prefix(3) $0
}
else
{
# Not in a paragraph: start one, even when this line is itself
# blank.
print prefix(2) "
"
if ($0 ~ /^[ \t]*$/)
print ""
else
print prefix(3) $0
in_paragraph = 1
}
}
}
function make_anchors(    linked)
{
    # Rewrite the current record in place: first pass it through the
    # URL helper, then pass that result through the e-mail address
    # helper.
    linked = make_anchors_helper($0)
    $0 = make_email_helper(linked)
}
function make_anchors_helper(s,    done, url)
{
    # Return s with a wrapper placed around every URL it contains.
    # See Internet RFC 1630 (June 1994) for details of URL syntax.
    #
    # The string is consumed from the left: each pass moves the text
    # before the next URL, and then the wrapped URL, onto the result.
    #
    # NOTE(review): the two wrapper literals are empty in this copy of
    # the file, so the text comes back unchanged; presumably they held
    # the opening and closing anchor markup -- confirm against a
    # pristine copy.
    done = ""
    while (match(s,/[A-Za-z]+:\/\/[^{}\[\]\\^~<>`\" ]+/))
    {
        url = substr(s, RSTART, RLENGTH)
        done = done substr(s, 1, RSTART - 1) "" url ""
        s = substr(s, RSTART + RLENGTH)
    }
    return (done s)
}
function make_email_helper(s,    done, address)
{
    # Return s with a wrapper placed around every e-mail address it
    # contains.  An address is recognized as a name of two or more
    # characters (a letter, then letters, digits, or hyphens), an
    # at-sign, and a host name with at least one dot.
    #
    # The string is consumed from the left: each pass moves the text
    # before the next address, and then the wrapped address, onto the
    # result.
    #
    # NOTE(review): the two wrapper literals are empty in this copy of
    # the file, so the text comes back unchanged; presumably they held
    # the opening and closing anchor markup -- confirm against a
    # pristine copy.
    done = ""
    while (match(s,/[A-Za-z][A-Za-z0-9-]+@[A-Za-z][A-Za-z0-9-]*([.][A-Za-z][A-Za-z0-9-]*)+/))
    {
        address = substr(s, RSTART, RLENGTH)
        done = done substr(s, 1, RSTART - 1) "" address ""
        s = substr(s, RSTART + RLENGTH)
    }
    return (done s)
}
function make_escape_sequences()
{
    # Replace, in the current record ($0), the four characters that
    # are special in HTML text by their character entity references.
    #
    # The ampersand MUST be handled first; otherwise the ampersands
    # that introduce the other three entities would themselves be
    # escaped.
    #
    # In each replacement text the backslash-ampersand pair stands for
    # a literal ampersand; a bare ampersand would instead mean "the
    # matched text".
    gsub(/&/, "\\&amp;")
    gsub(/</, "\\&lt;")
    gsub(/>/, "\\&gt;")
    gsub(/"/, "\\&quot;")
}
function package_name(name)
{
    # Reduce a distribution basename to its package name by cutting
    # off a trailing version part: a hyphen or underscore followed by
    # one or more digits, dots, hyphens, or underscores.  For example,
    # foo-1.23.1 becomes foo.  A name without such a tail is returned
    # unchanged.
    sub(/[-_][-_0-9.]+$/, "", name)

    return name
}
function prefix(level)
{
    # Return the indentation for nesting depth level: INDENT * level
    # blanks, but never more than 60.  A level of 0 gives the empty
    # string.
    #
    # The 60 blanks are generated with sprintf() instead of being
    # written out as a literal: a literal run of blanks is easily
    # shortened by accident, and with only one blank left in it every
    # level above 0 was indented by a single column.
    return (substr(sprintf("%60s", ""), 1, INDENT * level))
}
function verbatim_suffix_start(line, word,    start)
{
    # Private helper for verbatim_file(): if line ends with exactly the
    # text word, return the position in line where that final copy of
    # word starts; otherwise return 0.  The comparison is a plain
    # string comparison, so characters such as + * [ ( in word have no
    # special meaning.
    start = length(line) - length(word) + 1
    if ((length(word) > 0) && (start >= 1) && (substr(line, start) == word))
        return start
    return 0
}

function verbatim_file(    hostname, start)
{
    # Convert an automatically generated alphabetical file listing to a
    # verbatim region.  The current record (the title line) is printed
    # as it is, and then ALL remaining input is read and printed here,
    # with three kinds of line given a wrapper around their last field:
    # the Host line, the Directory line, and ordinary file lines.
    #
    # hostname and start are local variables.
    #
    # The last field is located with a literal suffix comparison.
    # Using the field text as a dynamic regular expression, as was done
    # before, fails or mis-matches for names that contain regular
    # expression operators (for example, c++).
    #
    # NOTE(review): the wrapper literals around the last field are
    # empty in this copy of the file, so these lines are currently
    # printed unchanged; presumably they held anchor markup built from
    # the host, directory, and file names -- confirm against a pristine
    # copy.
    hostname = ""
    begin_verbatim()
    print $0
    while (getline > 0)
    {
        make_escape_sequences()
        make_anchors()

        # If we can match a line like this:
        #     # Host: ftp.math.utah.edu
        # we can convert it to a hypertext link, and remember the host
        # for the Directory line.
        if ((NF == 3) && ($0 ~ /\# Host:/))
        {
            start = verbatim_suffix_start($0, $3)
            if (start > 0)
            {
                hostname = $3
                print substr($0, 1, start - 1) "" $3 ""
            }
            else
                print $0
        }

        # If we can match a line like this:
        #     # Directory: /pub/tex/pub/tgrind
        # we can convert it to a hypertext link, but only when the host
        # is already known.
        else if ((NF == 3) && ($0 ~ /\# Directory:/))
        {
            start = verbatim_suffix_start($0, $3)
            if ((start > 0) && (hostname != ""))
                print substr($0, 1, start - 1) "" $3 ""
            else
                print $0
        }

        # A typical file line looks like this:
        #     -rw-r--r--        0 Nov 11 19:17 00tdir.cmd
        #     -rw-rw-r--+ 1133099 Nov 16 10:40 lanczos-cornelius.html
        # If we match this pattern, we supply an anchor around the
        # filename, so that it can be easily selected in a WWW client.
        else if ((NF == 6) && \
                 ($1 ~ /^[-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][-rwxd][+]?$/))
        {
            start = verbatim_suffix_start($0, $6)
            if (start > 0)
                print substr($0, 1, start - 1) "" $6 ""
            else
                print $0
        }
        else
            print $0
    }
    end_verbatim()
}
function verbatim_line()
{
    # Print one percent-prefixed line as verbatim text.  The leading
    # run of percents is removed, and then at most one blank that
    # followed it.

    # Switch from list mode to verbatim mode if necessary.
    if (in_description_list)
    {
        end_description_list()
    }
    if (!in_verbatim)
    {
        begin_verbatim()
    }

    sub(/^%+/, "")
    sub(/^ /, "")

    make_escape_sequences()
    make_anchors()
    print
}