#! /usr/bin/gawk -f
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
# All rights reserved.
#
# urls-to-wget - Generate a wget script for a set of URLs.  The script
# assumes you are already logged in to Emulab, with a valid cookies.txt file.
#
#   Input is a set of page URLs including appended ?args, from sep-urls.gawk .
#   Interspersed action lines for setup and teardown may be prefixed by a
#   "!" or "-".  Further description of these action lines is below.
#
#   Output is a csh script containing "wget" statements to simulate submitting
#   HTML form responses to drive the web site, interspersed with other state
#   query and control structure lines.  You may run the whole thing en masse,
#   or copy-paste individual commands into an interactive c-shell.
#
#   The GET arg method is default, including action= args for a POSTed form.
#   A POST argument string follows a "?post:" separator after the other ?args.
#
#   A -v COOKIES= awk arg gives the path to an alternate cookies.txt file.
#   A -v OUTDIR= awk arg gives the path to an alternate output directory.
#        (Otherwise, .html output files go into the current directory.)
#   A -v SRCDIR= awk arg gives the path to the script source directory.
#   A -v FAILFILE=failure.txt awk arg enables generation of conditional
#        probe inverse "-" action lines.  See below.
#
#   Action lines in the input stream may be prefixed by a "!" or a "-".
#   . "!" lines are just put into the output script among the wget lines.
#   . "-" lines are *conditional undo* (inverse) actions for probing.
#
#   A conditional undo action line immediately follows a /filename line in the
#   file list.  It gives a corresponding "inverse action" to conditionally
#   undo the action of the probed page when its execution *DOESN'T FAIL*.
#   This is useful when the probe value given for a specific input field is
#   ignored.  For example, the first beginexp probe that succeeds uses up the
#   experiment name and blocks all other probes, so the experiment has to be
#   deleted again by the inverse action before the next probe is done.
#
#   Undo action "-" lines are wrapped in an "if" test in the generated script
#   so they only run if the preceding probe wget output file DOES NOT match
#   any grep pattern in failure.txt .  NOTE: This will only work reliably with
#   a completed failure.txt pattern file, i.e. there are no remaining
#   "UNKNOWN" entries in the analyze_output.txt file.
#
#   -wget lines will have the rest of the URL and option arguments filled in.
#   !sql or -sql lines are quoted queries that are passed to the inner boss tbdb.
#   !varnm=sql is a variant for getting stuff from the DB into a shell variable.
#   Other "-" or "!" action lines are put into the shell script verbatim.

# Startup: build the option strings shared by every generated wget command.
# Reads the -v variables COOKIES and OUTDIR; sets the globals verbose,
# ld_cookies, outpath and wget_args.
BEGIN {
    # -S makes wget print the server response headers.
    verbose = "-S ";

    # Cookie file of the logged-in session; -v COOKIES= overrides the default.
    if ( COOKIES == "" ) COOKIES = "cookies.txt";
    ld_cookies = "--load-cookies " COOKIES;

    # Output directory prefix from -v OUTDIR=.  A trailing slash is added when
    # missing so file names can be appended directly; an empty value leaves
    # the output files in the current directory.
    outpath = OUTDIR;
    if ( length(outpath) && !match(outpath, "/$") ) outpath = outpath "/";

    # Don't get prerequisites (-p) so we can redirect the output page (-O).
    wget_args = verbose "-k --keep-session-cookies --no-check-certificate";
}

# Action lines.
# Input lines starting with "!" or "-" are turned into one shell command
# (cmd).  "!" lines are always emitted; "-" lines are emitted only as a
# conditional undo after a probe URL, and only when FAILFILE is given.
/^!/ || /^-/ {
    # The first character selects the kind of action line.
    type = substr($0, 1, 1);

    if ( $0 ~ /^.wget / ) {
        # Fill in the rest of the URL from the host prefix of the last URL line.
        process_url(last_prefix "/" $2); # Sets url, url_args, post_args.

        # Put the undo output into a separate subdir to avoid confusion.
        file_args = "-O " outpath "undo/" last_file;
        cmd = sprintf("wget %s %s %s%s %s",
                      wget_args, ld_cookies, file_args, post_args, url_args);
    }
    else if ( $0 ~ /^.sql / ) {
        # Everything after the "sql" keyword is echoed into mysql on $MYBOSS.
        cmd = "echo " substr($0, 5) "| ssh $MYBOSS mysql tbdb";
    }
    else if ( match($0, /^.(\w+)=sql (.*)/, s) ) {
        # s[1] is the shell variable name, s[2] the query.  "tail +2" drops
        # the first output line (presumably the mysql column-header line).
        cmd = "set " s[1] "=`echo " s[2] "| ssh $MYBOSS mysql tbdb | tail +2`" \
              "; echo \"    " s[1] " = $" s[1] "\""; # Show the value too.
    }
    else {
        # Any other action line is passed through minus its prefix character.
        cmd = substr($0, 2);
    }

    # Unconditional action lines start with an exclamation point.
    if ( type == "!" ) { print "    " cmd; }

    # Conditional "inverse action" lines start with a dash.
    # Only need to undo when probing, and only after lines containing a probe.
    # (The others are the real setup/teardown actions that allow continuing.)
    if ( type == "-" && length(FAILFILE) && last_url_was_probe ) {
        # Locate the failure-pattern file.  Without SRCDIR the bare FAILFILE
        # name is used, instead of a path rooted at "/".
        failpath = ( length(SRCDIR) ? SRCDIR "/" FAILFILE : FAILFILE );

        # Do the undo only when the thing being undone didn't fail.
        printf "if ( ! { grep -q -f %s %s%s } ) then\n    %s\nendif\n",
            failpath, outpath, last_file, cmd;
    }
    next;
}

# process_url(u) - Split one URL string into the pieces handed to wget.
#   u: a page URL, optionally with ?GETargs and a trailing "?post:" string.
#   Sets these globals:
#     url       - the %-escaped URL with any "?post:" part removed.
#     url_args  - url wrapped in double quotes.
#     post_args - " --post-data \"...\"" for the "?post:" part, else "".
#     post      - 1 when a "?post:" part was found, else 0.
#   The current input record ($0) is left untouched.
function process_url(u) {	# Sets url, url_args, post_args.
    # Encode a few characters as %escapes.  The target must be u: gsub()
    # without a third argument rewrites $0, which would leave the URL itself
    # unescaped inside the double quotes added below.
    gsub(" ", "%20", u);
    gsub("!", "%21", u);
    gsub("\"", "%22", u);
    gsub("#", "%23", u);
    gsub("\\$", "%24", u);
    # "/" is left unencoded (an earlier gsub for it was disabled).

    # Separate off a "?post:" argument string at the end of the URL.
    # The "?" is escaped so it is a literal in any regex dialect.
    url = u;
    post = ( match(url, /\?post:/) != 0 );
    if ( post ) {
        # RSTART+6 skips over the 6-character "?post:" separator.
        post_args = sprintf(" --post-data \"%s\"", substr(url, RSTART+6));
        url = substr(url, 1, RSTART-1);
    }
    else post_args = "";
    url_args = sprintf("\"%s\"", url);
}

# URL lines.
{
    # Skip blank input lines; they would otherwise produce a wget command
    # with an empty URL and disturb the last-file / probe bookkeeping.
    if ( $0 ~ /^[ \t]*$/ ) next;

    process_url($0);	# Sets url, url_args, post_args.

    # Parse the URL string into the host/filename, ignoring optional ?GETargs.
    # "?GETargs" could be in a simple pattern, except for slashes in the values.
    u = url; g = "";
    q = index(u, "?");
    if ( q ) {
        u = substr(url, 1, q-1);	# Part before the first "?".
        g = substr(url, q+1);		# GET args after it.
    }
    # p[1] is everything up to the last "/", p[2] the page file name.
    match(u, "^(http.*)/(.*)", p);

    # Make a local destination file, with a numeric suffix if needed.
    # (We may hit the same page many times when probing.)
    # last_prefix and last_file are remembered for following action lines.
    prefix = last_prefix = p[1];
    file = p[2];
    suffix = files[file];	# Null string initially, then 1, 2, 3...
    files[file]++;		# Increment for next time.
    if ( suffix ) file = file "." suffix;
    file = last_file = file ".html";	# html suffix for web browser.
    file_args = "-O " outpath file;

    print "wget", wget_args, ld_cookies, file_args post_args, url_args;

    # Remember whether this was a probing URL or a setup/teardown action URL.
    # Probe URLs are recognized by a "**{" marker in their arguments.
    last_url_was_probe = index(post_args url_args, "**{");
}