# 2013-03-04 johnpfeiffer
# python downloaders.py 1024 nonthreaded yahoo.com slashdot.org apple.com kittyandbear.net ibm.com msn.com = 2.5 seconds
# python downloaders.py 1024 threaded yahoo.com slashdot.org apple.com kittyandbear.net ibm.com msn.com = .7 seconds
# python downloaders.py 1024 multiprocess yahoo.com slashdot.org apple.com kittyandbear.net ibm.com msn.com = 1.5 seconds
import urllib2
import time
import sys
import Queue # synchronized data structure
import threading
import multiprocessing
class Worker( multiprocessing.Process ):
def __init__( self , url_queue , bytes_to_download ):
multiprocessing.Process.__init__( self )
self.url_queue = url_queue
self.bytes_to_download = bytes_to_download
def run( self ):
while True:
url = self.url_queue.get()
if url == None:
print '%s exiting' % self.name
self.url_queue.task_done()
break
print '\ndownloading %s' % url
urlsock = urllib2.urlopen( url )
print urlsock.read( self.bytes_to_download )
urlsock.close()
self.url_queue.task_done()
return
class ThreadedDownloader( threading.Thread ): # extend the thread class
def __init__( self , url_queue , bytes_to_download ): # fork-join design - each thread has the whole data set
threading.Thread.__init__( self )
self.url_queue = url_queue
self.bytes_to_download = bytes_to_download
def run( self ):
while True:
url = self.url_queue.get()
print '\ndownloading %s' % url
urlsock = urllib2.urlopen( url )
print urlsock.read( self.bytes_to_download )
urlsock.close()
self.url_queue.task_done()
def multiprocess( urls , bytes_to_download ):
cpus = multiprocessing.cpu_count()
cpus = 2
print 'cpus = %s' % cpus
url_queue = multiprocessing.JoinableQueue()
jobs = []
for i in range( cpus ): # start consumers (they wait for the 'poison pill' None)
p = Worker( url_queue , bytes_to_download )
jobs.append( p )
p.start()
for url in urls:
url_queue.put( url ) # add the tasks to be done into the shared Queue
for i in xrange( cpus ):
url_queue.put( None ) # poison pills to ensure all processes quit
url_queue.join() # wait for all queued worker jobs to complete?
def threaded( urls , bytes_to_download , thread_count=5 ):
    """Download the first bytes_to_download bytes of every URL concurrently
    using a pool of daemon ThreadedDownloader threads fed from a shared Queue.

    thread_count defaults to 5 (the value originally hard-coded here), so for
    maximum performance there should be at least 5 distinct hosts in urls.
    """
    url_queue = Queue.Queue()
    for url in urls:
        url_queue.put( url )
    for i in range( thread_count ):
        my_thread = ThreadedDownloader( url_queue , bytes_to_download )
        my_thread.setDaemon( True )  # daemon threads do not block interpreter exit; we join the queue instead
        my_thread.start()
    url_queue.join()  # wait until all queued URLs have been marked task_done; can be join( 1 ) for max wait 1 second
def non_threaded( urls , bytes_to_download ):
for url in urls:
print '\ndownloading %s' % url
urlsock = urllib2.urlopen( url )
print urlsock.read( bytes_to_download )
urlsock.close()
def convert_dns_name_to_url( hosts ):
    """Return a new list with every bare hostname prefixed by the http:// scheme."""
    return [ 'http://' + host for host in hosts ]
# MAIN ################################################################
def main():
CORRECTUSAGE = 'python get_urls.py 1024 [nonthreaded OR threaded] yahoo.com slashdot.org apple.com kittyandbear.net more-urls.example.com'
if len( sys.argv ) < 4:
print 'ERROR: incorrect number of arguments, correct usage: %s' % CORRECTUSAGE
sys.exit( 1 )
sys.argv.reverse() # pop removes the last element so we need the reverse order
app_name = sys.argv.pop()
bytes_to_download = sys.argv.pop()
try:
bytes_to_download = int( bytes_to_download )
except ValueError as e:
print 'ERROR: bytes_to_download (%s) must be an integer' % bytes_to_download
sys.exit( 1 )
threading_option = sys.argv.pop()
urls = convert_dns_name_to_url( sys.argv )
start_time = time.time()
if threading_option == 'threaded':
threaded( urls , bytes_to_download )
elif threading_option == 'multiprocess':
multiprocess( urls , bytes_to_download )
else:
non_threaded( urls , bytes_to_download )
print 'Elapsed Time: %s' % ( time.time() - start_time )
if __name__ == "__main__":
main()