#! /usr/bin/gawk -f
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
# All rights reserved.
#
# urls-to-wget - Generate a wget script for a set of URLs.  The script
# assumes you are already logged in to Emulab, with a valid cookies.txt file.
#
#   Input is a set of page URLs including appended ?args, from sep-urls.gawk .
#   Interspersed action lines for setup and teardown may be prefixed by a
#   "!" or "-".  Further description of action lines is below.
#
#   The GET arg method is default, including action= args for a POSTed form.
#   A POST argument string follows a "?post:" separator after the other ?args.
#
#   A -v COOKIES= awk arg gives the path to an alternate cookies.txt file.
#   A -v OUTDIR= awk arg gives the path to an alternate output directory.
#        (Otherwise, .html output files go into the current directory.)
#   A -v SRCDIR= awk arg gives the path to the script source directory.
#   A -v FAILFILE=failure.txt awk arg enables generation of conditional
#        probe inverse "-" action lines.  See below.
#
#   Action lines in the input stream may be prefixed by a "!" or a "-".
#   . "!" lines are just put into the output script among the wget lines.
#   . "-" lines are *conditional undo* actions for probing.
#
#   A conditional undo action line immediately follows a /filename line in the
#   file list.  It gives a corresponding "inverse action" to conditionally
#   undo the action of the probed page when its execution *DOESN'T FAIL*.
#   This is useful when the probe value given for a specific input field is
#   ignored.  For example, the first beginexp probe that succeeds uses up the
#   experiment name and blocks all other probes, so the experiment has to be
#   deleted again by the inverse action before the next probe is done.
#
#   Undo action "-" lines are wrapped in an "if" test in the generated script
#   so they only run if the preceding probe wget output file DOES NOT match
#   any grep pattern in failure.txt .  NOTE: This will only work reliably with
#   a completed failure.txt pattern file, i.e. there are no remaining
#   "UNKNOWN" entries in the analyze_output.txt file.
#
#   -wget lines will have the rest of the URL and option arguments filled in.
#   !sql or -sql lines are quoted queries that are passed to the inner boss tbdb.
#   !varnm=sql is a variant for getting stuff from the DB into a shell variable.
#   Other "-" or "!" action lines are put into the shell script verbatim.

BEGIN {
    # One-time setup: build the option strings shared by every generated
    # wget command.  COOKIES and OUTDIR may be supplied with -v on the
    # gawk command line.

    # Extra wget flag(s) prepended to the common argument string below.
    verbose = "-S ";

    # Cookie file: default to cookies.txt when no COOKIES= was given.
    if ( COOKIES == "" )
        COOKIES = "cookies.txt";
    ld_cookies = "--load-cookies " COOKIES;

    # Output directory prefix: empty means the current directory; otherwise
    # make sure it ends with exactly the slash needed to append a file name.
    outpath = OUTDIR;
    if ( length(outpath) && outpath !~ /\/$/ )
        outpath = outpath "/";

    # Don't get prerequisites (-p) so we can redirect the output page (-O).
    wget_args = verbose "-k --keep-session-cookies --no-check-certificate";
}

# Action lines.
/^!/ || /^-/ {
    # Action-line rule.  Translates one "!" (unconditional) or "-"
    # (conditional undo) input line into a command in the generated script:
    #   .wget page?args   -> a full wget command, output under undo/
    #   .sql query        -> query piped to mysql tbdb on $MYBOSS via ssh
    #   .varnm=sql query  -> query result captured into shell variable varnm
    #   anything else     -> the rest of the line, verbatim
    # Reads last_prefix and last_file, which the URL-line rule sets.
    type = substr($0, 1, 1);

    if ( $0 ~ /^.wget / ) {
        # Complete the URL using the prefix of the most recent URL line.
        process_url(last_prefix "/" $2); # Sets url, url_args, post_args.
        # Put the undo output in a separate subdir to avoid confusion.
        file_args = "-O " outpath "undo/" last_file;
        cmd = sprintf("wget %s %s %s%s %s",
                      wget_args, ld_cookies, file_args, post_args, url_args);
    }
    else if ( $0 ~ /^.sql / ) {
        # Everything after the 4-character ".sql" tag is the query text.
        cmd = "echo " substr($0, 5) "| ssh $MYBOSS mysql tbdb";
    }
    else if ( match($0, /^.(\w+)=sql (.*)/, s) ) {
        # s[1] is the shell variable name, s[2] the query text.
        # "tail -n +2" keeps output from the second line on (presumably
        # dropping mysql's column-header line).  The obsolete "tail +2"
        # spelling is rejected or read as a file name by current tails.
        cmd = "set " s[1] "=`echo " s[2] "| ssh $MYBOSS mysql tbdb | tail -n +2`" \
              "; echo \"    " s[1] " = $" s[1] "\""; # Show the value too.
    }
    else {
        cmd = substr($0, 2);
    }

    # Unconditional action lines start with an exclamation point.
    if ( type == "!" ) {
        print "    " cmd;
    }
    # Conditional "inverse action" lines start with a dash.
    else if ( length(FAILFILE) ) { # Only need to undo when probing.
        # Only prefix the directory when SRCDIR was given; an empty SRCDIR
        # used to yield the root path "/<FAILFILE>".
        failpath = (length(SRCDIR) ? SRCDIR "/" : "") FAILFILE;
        # Wrap in an if-block: run the undo only when the probe's output
        # file matches none of the failure patterns.
        printf "if ( ! { grep -q -f %s %s%s } ) then\n    %s\nendif\n",
            failpath, outpath, last_file, cmd;
    }
    next;
}

# process_url(u) - Split a URL string into the pieces needed for a wget
# command line.
#
#   u        The URL, optionally followed by "?post:" and a POST argument
#            string.
#   post_at  Local variable only (awk idiom); callers pass just u.
#
# Sets these globals:
#   url        The %-escaped URL with any "?post:..." tail removed.
#   url_args   url wrapped in double quotes for the shell.
#   post_args  " --post-data \"...\"" when a "?post:" tail exists, else "".
#   post       1 when a "?post:" tail was found, else 0.
function process_url(u,    post_at) {	# Sets url, url_args, post_args.
    # Encode a few characters as %escapes.  The target argument is
    # required: without it gsub() edits $0, leaving u unescaped and
    # clobbering the current input record instead.
    # NOTE(review): "$" is encoded too, so shell variables written into a
    # URL will not be expanded by the generated script - confirm that no
    # input relies on that.
    gsub(" ", "%20", u);
    gsub("!", "%21", u);
    gsub("\"", "%22", u);
    gsub("#", "%23", u);
    gsub("\\$", "%24", u);
    # "/" is deliberately left unencoded.

    # Separate off a "?post:" argument string at the end of the URL.
    # index() does a plain substring search, so the leading "?" needs no
    # regexp quoting.
    url = u;
    post_at = index(url, "?post:");
    post = (post_at != 0);
    if ( post ) {
        # 6 is the length of the "?post:" separator itself.
        post_args = sprintf(" --post-data \"%s\"", substr(url, post_at + 6));
        url = substr(url, 1, post_at - 1);
    }
    else post_args = "";
    url_args = sprintf("\"%s\"", url);
}

# URL lines.
{
    # URL-line rule.  Emits one wget command per input URL and records
    # last_prefix / last_file for any action line that follows it.
    process_url($0);		# Sets url, url_args, post_args.

    # Parse the URL string into p[1] = everything before the page name,
    # p[2] = page name, p[3] = ?args.  The prefix and page name are both
    # limited to non-"?" characters so that slashes inside the ?args cannot
    # stretch the prefix past the page name.
    if ( !match(url, "^(http[^?]*)/([^?]*)[?]?(.*)", p) ) {
        # Not a URL (e.g. a blank line): emit nothing rather than a wget
        # command with an empty file name and URL.
        if ( url != "" )
            print "urls-to-wget: skipping unparseable URL line: " url > "/dev/stderr";
        next;
    }
    ##printf "URL: [1] %s, [2] %s, [3] %s\n", p[1], p[2], p[3];

    # Make a local destination file, with a numeric suffix if needed.
    # (We may hit the same page many times when probing.)
    prefix = last_prefix = p[1];
    file = p[2];
    suffix = files[file];	# Null string initially, then 1, 2, 3...
    files[file]++;		# Increment for next time.
    if ( suffix ) file = file "." suffix;
    file = last_file = file ".html";	# html suffix for web browser.
    file_args = "-O " outpath file;

    # post_args carries its own leading space, so it is concatenated
    # directly onto file_args rather than listed as a separate print item.
    print "wget", wget_args, ld_cookies, file_args post_args, url_args;
}