# 2013-02-07 johnpfeiffer very fragile page scraper to download oxygen weblinks
import os
import sys
import requests # have the requests directory right next to this script
# Marker for the meta-refresh tag whose url= value is the actual download link
URL_IDENTIFIER = '<meta HTTP-EQUIV="REFRESH" content="0; url='
# Marker immediately preceding the human-readable filename in the page's title cell
FILENAME_IDENTIFIER = '<td id="contentTitle" style="overflow:hidden; width:700px;"><b>'
def find_substring( start_identifier , end_identifier , data ):
    """Return the text found between start_identifier and end_identifier in data.

    Returns None when either identifier is absent, instead of letting
    str.index propagate ValueError -- the caller tests the result for
    truthiness, so a crash here was a bug.
    """
    try:
        index_start = data.index( start_identifier )
        # drop everything up to and including the start identifier
        fragment = data[ index_start + len( start_identifier ): ]
        # keep everything before the first occurrence of the end identifier
        return fragment[ : fragment.index( end_identifier ) ]
    except ValueError:
        # one of the identifiers was not found in data
        return None
# MAIN #####
def main():
    """Download the file referenced by an Oxygen weblink page.

    Expects exactly one command line argument: the weblink URL.  Scrapes the
    page for the redirect URL and the display filename, then downloads the
    target file into the current working directory.
    """
    print( 'arguments passed in: %s \n' % str( sys.argv ) )
    if len( sys.argv ) != 2:
        print( 'WRONG number of arguments, correct usage: python download-weblink.py "https://long-weblink-string" (quotes are needed!)' )
        return
    response = requests.get( sys.argv[1] )
    if response.status_code != 200:
        print( 'ERROR: did not receive a 200 response code' )
        return
    url = find_substring( URL_IDENTIFIER , '"' , response.text )
    remote_filename = find_substring( FILENAME_IDENTIFIER , '<' , response.text )
    # validate BEFORE using the values: the page scrape is fragile and either
    # identifier may be missing (find_substring returns None in that case)
    if not url or not remote_filename:
        print( 'ERROR: unable to find the URL or filename \n' )
        return
    print( '%s is at %s \n' % ( remote_filename , url ) )
    remote_file = requests.get( url )
    destination_file_path = os.path.join( os.getcwd() , remote_filename )
    # with-statement guarantees the handle is closed even if the write fails
    with open( destination_file_path , 'wb' ) as destination_file:
        destination_file.write( remote_file.content )
# Standard entry-point guard: run the scraper only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()