#! /usr/bin/gawk -f
#
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#
# forms-to-urls.gawk - Generate URLs for accessing the site.
#
#   form-input.gawk's output format is the input format for this script.
#
#   A site_values.list file path is provided by a -v VALUES=... awk arg.
#
#   . Contents are 'name="..." value'.  An optional value (separated by a
#     space character, extending to the end of line) is the default used for
#     auto-form-fill-in.  Names may be of the form array[element] as in PHP.
#
#   . Specifying !name="..." (an exclamation-point prefix) causes matching
#     input fields to be skipped.  If the name ends with a "*" then the name
#     is a prefix and all names _starting with_ that string are skipped.  Note
#     that "[" characters in names are not treated specially, so e.g. you can
#     skip a whole array by specifying !name="...[*" .
#
#   . The value may be prefixed with a ! to cause it to over-ride an action=
#     argument in the form page URL.  It follows that a value of just an "!"
#     specifies a null string value.
#
#   . The value may contain a %d, which is replaced with a disambiguating number
#     for argument values after the first.  (The first one just gets a null
#     string, as in the output file names in urls-to-wget.gawk .)  This is
#     useful for probing, where multiple probes will be generated for a single
#     page but the values can conflict.
#
#   Output is a set of page URLs including appended ?args.
#   The GET arg method is default, including action= args for a POSTed form.
#   A POST argument string follows a "?post:" separator after the other ?args.
#
#   A -v MAX_TIMES= awk arg specifies how many times to target a form.
#
#   A -v PROBE=1 awk arg turns on SQL injection probing.  A separate URL is
#   generated for each ?argument, substituting a labeled mock SQL injection
#   attack probe string for the proper value.
#
# Start-up: apply the MAX_TIMES default and load the VALUES file into the
# skip_name[], skip_prefix[] and defaults[] tables used by the rules below.
BEGIN {
    if ( ! MAX_TIMES ) MAX_TIMES = 1; # Default.

    # A null file name cannot be used in a getline redirection, so only read
    # when a -v VALUES=... path was actually given.
    if ( VALUES != "" ) {
	# Compare against 0 explicitly: getline returns -1 on a read error (e.g.
	# a missing file), which is "true" and would otherwise loop forever.
	while ( (getline < VALUES) > 0 ) {
	    # First field is [!]name="...".  Strip the name="..." wrapper.
	    arg_name = $1;
	    arg_name = gensub("name=\"([^\"]*)\"", "\\1", 1, arg_name);
	    if ( substr($1, 1, 1) == "!" ) {
		# A ! prefix marks input fields to skip; a trailing * makes the
		# name a prefix that skips every name starting with it.
		arg_name = substr(arg_name, 2);
		##print "not", arg_name;
		if ( substr(arg_name, length(arg_name)) == "*" )
		    skip_prefix[substr(arg_name, 1, length(arg_name)-1)] = 1;
		else skip_name[arg_name] = 1;
	    }
	    if (NF > 1) {
		# The default value is everything after the first field and the
		# white space following it.  Measure the first field by position
		# rather than searching for the text of $2, which finds the wrong
		# place when that text also occurs inside the first field.
		match($0, /^[ \t]*[^ \t]+[ \t]+/);
		defaults[arg_name] = substr($0, RLENGTH + 1);
	    }
	    ##printf "defaults %s=%s.\n", arg_name, defaults[arg_name];
	}
	close(VALUES);
    }
}

# Page section header: a line starting with "./" is the page's file name.
/^[.][\/]/ {
    # Drop the leading "./" and the final "/component"; the remainder is kept as
    # host_path for building URLs from relative <form action= values.  A line
    # with no further "/" does not match and is kept whole.
    host_path = gensub("^[.][/](.*)/.*", "\\1", 1, $0);
    ##print "host_path", host_path;
}

# Start of a form section.  Sets url, action_file, method and action_args[]
# from the <form> line, clears args[], and decides (via MAX_TIMES) whether the
# following <input lines of this form are processed at all.
/^<form/ {			# <form action="..." method="..."
    action = gensub(".* action=\"([^\"]*)\".*", "\\1", 1);
    # HTML method names are case-insensitive; the output rule tests for "post".
    method = tolower(gensub(".* method=\"([^\"]*)\".*", "\\1", 1));

    # Action= URL can have args specified.  Use the values over anything else,
    # unless the default value is prefixed with a ! .  Keep them separate from
    # POST args because PHP code may get them through the $_GET array.
    url = action;
    # File name of the action script: no query string, no directory part.
    # "[?]" is a literal question mark; a bare leading "?" is a regex operator
    # with nothing to apply to.
    action_file = gensub(".*/", "", 1, gensub("[?].*", "", 1, url));
    delete args; delete action_args;
    if ( q = index(action, "?") ) {
	url = substr(action, 1, q-1);

	# The "&" arg separator is escaped in HTML.
	n = split(substr(action, q+1), url_args, "&amp;");
	for (i = 1; i <= n; i++) {
	    name_val = url_args[i];
	    eq = index(name_val, "=");
	    if ( eq ) {
		# substr() positions start at 1.  A start of 0 makes gawk count
		# the length from position 0, losing the name's last character.
		nm = substr(name_val, 1, eq-1);
		vl = substr(name_val, eq+1);
	    }
	    else {
		# No "=": the whole thing is a name with a null value.
		nm = name_val;
		vl = "";
	    }
	    if ( nm == "" ) continue;	# Nothing usable, e.g. a stray "&amp;".

	    # Input fields to be skipped.
	    if ( skip_name[nm] ) continue;
	    # A "continue" inside the prefix loop would only continue that inner
	    # loop, so record the match and skip the arg afterwards.
	    skip_arg = 0;
	    for (j = 1; j <= length(nm); j++ )
		if ( skip_prefix[substr(nm, 1, j)] ) { skip_arg = 1; break; }
	    if ( skip_arg ) continue;

	    action_args[nm] = vl;

	    # A default with a ! prefix over-rides an action= arg.  Anchor the
	    # test so a "!" elsewhere in the default is not taken for the prefix.
	    df = defaults[nm];
	    if ( df ~ /^!/ )
		action_args[nm] = substr(df, 2);
	    ##printf "name_val %s, nm %s, vl %s, df %s\n", name_val, nm, vl, df;
	}
    }

    # Add host path to relative url's.
    if (! index(url, ":") ) url = "https://" host_path "/" url;
    
    ##printf "url %s, file %s, method %s, action args", url, action_file, method;
    ##for (i in action_args) printf " %s", action_args[i]; printf "\n";

    target[url]++;
    form = target[url] <= MAX_TIMES; # Limit target hits.
    arg_vals = 0;		# Count arguments with user provided values.
}

# An input field of the current form.  Decides whether the field becomes a URL
# argument and, if so, stores its value in args[name]; arg_vals counts the
# stored values that are non-null.
form && /^<input/ {		# <input type="..." name="..." value=... ...>
    # Gotta have a name to be an arg.
    if ( $0 !~ " name=" ) next;

    # Type and name have been double-quoted.  Value can be single- or double-.
    type = gensub(".* type=\"([^\"]*)\".*", "\\1", 1);
    name = gensub(".* name=\"([^\"]*)\".*", "\\1", 1);

    # Input fields to be skipped: exact names, then every leading prefix.
    if ( skip_name[name] ) next;
    for (j = 1; j <= length(name); j++ )
	if ( skip_prefix[substr(name, 1, j)] ) next;

    if ( $0 ~ " value=\"" )
	value = gensub(".* value=\"([^\"]*)\".*", "\\1", 1);
    else if ( $0 ~ " value='" )
	value = gensub(".* value='([^']*)'.*", "\\1", 1);
    else value = "";
    checked = $0 ~ "\\<checked\\>";
    ##printf "type %s, name %s, value %s, checked %s\n", type, name, value, checked;

    # Fields that carry a value; checkboxes and radios only when checked.
    val_arg = (type=="text" || type=="textarea" || type=="password" || 
	       type=="hidden" || type=="select" || 
	       (type=="checkbox" || type=="radio") && checked);
    # Follow just the positive submit controls, not cancel, etc.
    sub_arg = (type=="submit" && 
	       (value ~ "Submit" || value ~ "Create" || 
		value=="Confirm" || value=="Go!"));

    if ( val_arg || sub_arg ) {
	##printf "name %s, default=%s, value=%s.\n", 
	##       name, defaults[name], value;
	df = defaults[name];
	if ( df != "" ) {
	    # Default value from VALUES file.  May have ! prefix.  Anchor the
	    # test: only a leading "!" is the prefix, so a default such as "Go!"
	    # must be used as it stands.
	    if ( df ~ /^!/ )
		args[name] = substr(df, 2);
	    else
		args[name] = df;
	}
	else
	    # Value from <input field default, possibly the null string.
	    args[name] = value;

	if ( args[name] ) arg_vals++;
    }
}

# End of a form section.  Builds the argument string from action_args[] and
# args[] and prints the form's URL; with PROBE set, first prints one probe URL
# per argument with that argument's value replaced by a labeled probe string.
form && /^$/ {			# Blank line terminates each form section.

    # The section is finished; a further blank line must not emit it again.
    form = 0;

    # Collect the arg strings, with action args first.
    arg_str = ""; n_args1 = n_args2 = 0;
    for (arg in action_args) {
	sep = ( n_args1==0 ? "?" : "&" );
	arg_str = arg_str sep arg "=" action_args[arg];
	n_args1++;
    }
    for (arg in args) {  # Form input field args, may be POSTed.
	if ( n_args2 != 0 ) sep = "&";
	else if ( method == "post" ) sep = "?post:";
	else sep = ( n_args1 == 0 ? "?" : "&" );
	arg_str = arg_str sep arg "=" args[arg];
	n_args2++;
    }

    if (arg_vals) {		# Ignore if no argument values to supply.

	if ( PROBE ) {
	    # When probing, generate N probe urls.  Substitute a labeled mock SQL
	    # injection attack probe string for one ?argument value in each URL.
	    delete all_args;
	    for (arg in action_args) all_args[arg] = action_args[arg];
	    for (arg in args) all_args[arg] = args[arg];
	    for (arg in all_args) {
		lbl = "**{" action_file ":" arg "}**";

		# Disambiguating number for %d.
		# NOTE(review): dnum starts unset, so the first probe URL gets
		# "0" here, not a null string; only the unmodified URL printed
		# below gets the null string.  Confirm that this is intended.
		dn_str = gensub("%d", dnum++, "g", arg_str);

		# Quote every regex metachar in the argument name for matching,
		# so nested array names such as a[b][c] match literally.
		a = arg;
		gsub(/[][\\.^$*+?(){}|]/, "\\\\&", a);

		# Notice the single-quote at the head of the inserted probe string.
		# "\" and "&" are special in a gensub() replacement, so escape any
		# that come in with the file or argument name.
		probe_val = "'" lbl;
		gsub(/[\\&]/, "\\\\&", probe_val);
		probe_str = gensub("(\\<" a ")=([^?&]*)", "\\1=" probe_val, 1, dn_str);

		print url probe_str;
	    }
	}

	# Not probing, or finished with probe URLs.  Put out the unmodified URL
	# *after* the probe URLs, since dependent actions later on will need the
	# results of a setup/teardown action.
	gsub("%d", "", arg_str);
	print url arg_str;

    }
}