#!/usr/bin/perl -wT
# xsss - Cross Site Scripting Scanner
# version 0.40
# (c) 2005 Sven Neuhaus <sn@heise.de>
# For license information, read the included file "GPL".

# TODO v1:
#	* https <-> http switches on same server
#	* circumvent HTML::Form restrictions on select/checkbox/radio input

use strict;
use WWW::Mechanize;
use Getopt::Long;
use URI::QueryParam;
use Data::Dumper;
#use Pod::Usage; # TODO
use constant DEBUG => 1;

### Prototypes
sub browse_url ($$;$$);

#### Globals
my $Mech;
my %Opts = (
	forms => '',		# form scanning is quite intrusive, disabled by default
	pathinfo => '',		# TODO, so disabled
	queries => 1,		# URL garbling enabled by default
	depth => 5,			# levels of recursion
	maxcount => 1000,	# number of requests
);
my %History; # list of visited URLs
my $Base_url;
my $XSS_count = 0; # XSS found
my $Req_count = 0; # Request counter

### Functions

sub main {
	select(STDERR); $|=1;
	select(STDOUT); $|=1;
	GetOptions(\%Opts, 'forms!', 'queries!', 'pathinfo!', 'list=s', 'depth=i', 'maxcount=i', 'help') or 
		usage();
	usage() if $Opts{help};
	usage() unless @ARGV;
	$Mech = WWW::Mechanize->new();
	$Mech->agent_alias('Linux Mozilla'); # TODO: make this a command line option
	my $base_url;
	if ($Opts{list}) {
		my $fh;
	    open( $fh, '<', $Opts{list} ) or 
			die "Can't open URL List '$Opts{list}': $!\n";
		# TODO(?) if file contains relative URLs, prepend baseurl to each one
		push @ARGV, map { chomp; $_ } <$fh>; # append to list of URLs
		close($fh);
	}
	while($Base_url = shift(@ARGV)) {
		warn "\nstarting at '$Base_url'\n" if DEBUG;
		browse_url($Base_url, 0, { start => 1 });
		last if $Req_count > $Opts{maxcount};
	}
	print STDERR "\nDone. Found $XSS_count suspects in $Req_count requests\n";
}

sub usage {
	print STDERR qq{
 Usage:
  $0 [options] url ...

 Valid options:
\t--forms, --queries, --pathinfo --list=file --depth=n --help --maxcount=n

\t--forms enables forms scanning (disabled by default)

\t--noqueries disables query string scanning (enabled by default)

\t--list=filename scans list of URLs from file

\t--depth=n mit levels of recursion. default is 5.

\t--maxcount=n Limit number of request. Default 1000

};
	exit(1);
	# TODO: add pod, use Pod::Usage
}

# TODO: split  browse_form from browse_url

# Visits a URL (and links in the resulting page)
# Parameters:
#	url as string or HTML::Response
#	recursion depth (start with 0)
#	scanstring (hashref, key "body") or key 'start'
sub browse_url ($$;$$) {
	my ($url, $depth, $scan, $form_url) = @_;
	if ($depth > $Opts{depth}) { # max recursions reached
		warn "too deep ($depth), skipping '$url'\n" unless $form_url;
		return;
	}
	if ($Req_count > $Opts{maxcount}) { # max req reached
		warn "\nmaxcount requests reached. \n";
		return;
	}
	my $resp;
	if (defined $form_url) { # called with Response, not URL (used for forms)
		$resp = $url;
		$url = $form_url;
	} else {
		print STDERR "browsing url '$url':" if DEBUG;
		if (defined $scan) {
			$resp = $Mech->get($url);
			undef $scan if exists $scan->{start}; # special case for first req
		} else {
			$resp = $Mech->get($url);
			#$resp = $Mech->follow_link(url_abs => $url);
			#warn "no resp for URL '$url'\n" unless defined $resp;
		}
	}
	$Req_count++;
	print STDERR $resp->code . ' ' . $resp->message . " \r" if DEBUG;

	if(defined $scan) {
#		$Mech->back();
		return scan_result($url, $scan); # no recursion when scanning
	}

	LINK:
	foreach my $link ($Mech->links) {
		my $link_uri = $link->URI;
		next LINK unless 
			index($link->url_abs, $Base_url) == 0; # stay on site/subdir
		next LINK unless want_to_visit($link_uri); # clears fragment

		if ($Opts{queries} && $link_uri->equery) { # scan for XSS in query string
			warn "\nscanning in link '" . $link_uri->abs . "'\n" if DEBUG > 2;

			# TODO: 
			# 	permute matching the way it appears in the output
			#	unpermuted. I.e. when in quoted string, try ", when
			#	in general text, try "<" etc.
			foreach my $new_url (permute_link($link_uri, '<')) {
				next unless want_to_visit($new_url->[0]);
				browse_url($new_url->[0]->abs, $depth+1, 
					{ body => $new_url->[1] });
				return if $Req_count > $Opts{maxcount};
			}

			foreach my $new_url (permute_link($link_uri, q{"})) {
				next unless want_to_visit($new_url->[0]);
				browse_url($new_url->[0]->abs, $depth+1, 
					{ body => $new_url->[1] });
				return if $Req_count > $Opts{maxcount};
			}
		}
		browse_url($link_uri->abs, $depth+1);
		return if $Req_count > $Opts{maxcount};
	}
	if ($Opts{forms}) { # TODO fill in and submit forms
		# Do we want to submit unaltered forms? (only if not text fields!?)
		my $form_num = 0; # count forms
		foreach my $form ($Mech->forms) {
			$form_num++;
			next unless want_to_visit($form);

			# fill one field at a time with control characters and submit it
			foreach my $input ($form->inputs) {
				# HTML::Form won't let us set arbitrary values on them so we
				# skip them (only for now!)
				next if $input->type =~ /^radio|checkbox|option$/;
				next unless defined $input->name;
				my $rstr = random_ctrl_string(q{"});
				warn "\nsubmitting form $form_num with name='" . $input->name . 
					"'='$rstr'\n" if DEBUG;
				my $resp = $Mech->submit_form(
					form_number	=> $form_num,
					fields		=> { $input->name => $rstr }
				);
				warn "\nsubmitted!\n" if DEBUG;
				browse_url($resp, $depth+1, { body => $rstr }, form_to_url($form));
				return if $Req_count > $Opts{maxcount};
			}
			# TODO: fill all form fields, one of them with a control-char-string
			
			$form_num++;
		}
	}
	if ($Opts{pathinfo}) { # TODO append evil stuff to path
		# recognize CGIs: cgi-bin/bin/dyn/.cgi/.pl/.php/...
		# if url ends in "/"
		#	append ", <
		# else
		#	append /", /<
	}
#	$Mech->back(); # done recursing
}

# scan result for our parameters
sub scan_result ($$) {
	my($url, $scan) = @_;
	if ($scan->{body}) {
		my $offset = index($Mech->content, $scan->{body});
		if ($offset != -1) {
			print "\n### Possible XSS in Content:\n" .
				"\turl   : $url\n" .
				"\toffset: $offset\n" .
				"\ttext  : $scan->{body}\n";
			$XSS_count++;
		}
	} # TODO elsif $scan-{header}
}

# decide wether or not to visit a URL
# TODO:
# 	detect when certain scripts get called with varying parameters real
# 	often, like viewtopic on a forum script (and stop visiting them)
# TODO:
#	form handling (GET + POST)
sub want_to_visit ($) {
	my $url = shift;
	if (ref($url) eq 'URI::URL') {
		$url->fragment(undef); # clear fragment, if any
		# we don't want certain boring fileformats
		return if $url->path =~ /\.(?:exe|gif|jpg|jpeg|png|bmp|doc|txt|pdf|css|swf|zip|ace|rar|lha|lzh|tar|tar\.gz|tar.bz2|mov|mpg|mpeg|avi|mp4|wav|flac|mp3|ogg|ts|js|css)$/i;
		# we do want *HTML, PHP, PHP4, XML, ...
		# return unless $url->path =~ /\.(?:php|xml|.?html)$/;
		# don't get X variants of the same Apache-Autodir
		return if defined $url->equery &&
			$url->equery =~ /^[DSMN]=[AD]$/; # Apache Autoindex
		return if exists $History{$url->abs};
		return $History{$url->abs} = 1;

	# Forms
	} elsif (ref($url) eq 'HTML::Form') {
		# convert form into query string
		my $form_url = form_to_url($url);
		return if exists $History{$form_url};
		return $History{$form_url} = 1;
	}
}

# equivalent URL for a Form, regardless of HTTP-method
sub form_to_url ($) {
	my $form = shift;
	my $d = $form->action . '?' . join('&',
		map { defined $_->name 
				? $_->name . '=' . (defined($form->value($_->name))
					? $form->value($_->name)
					: '')
				: ()
			} $form->inputs);
	return $d;
}

# returns List of URLs where each parameter in turn has been
# garbled with HTML control chars
sub permute_link ($$) {
	my ($uri, $char) = @_;
	$uri->fragment(undef); # clear fragment, if any
	my @url_list;
	foreach my $key ($uri->query_param) {
		# TODO parameters with multiple values a la ?foo=1&foo=2
		my $u2 = $uri->clone;
		#my $value = $uri->query_param($key) . $char;
		my $value = random_ctrl_string($char);
		$u2->query_param($key => $value);
		push @url_list, [ $u2, $value ];
	}
	return @url_list;
}

sub random_ctrl_string ($) {
	my $char = shift;
	return (10_000 + int(rand(90_000))) . $char . 
			(10_000 + int(rand(90_000))); # 5 digits, char, 5 digits
}

main();


#eof. This file has not been truncate
