john pfeiffer
  • Home
  • Categories
  • Tags
  • Archives

http scrape oxygen weblink

# 2013-02-07 johnpfeiffer very fragile page scraper to download oxygen weblinks
import os
import sys
import requests     # have the requests directory right next to this script

# Text that immediately precedes the download URL in the weblink page's
# meta-refresh tag; main() reads the URL from here up to the next '"'.
URL_IDENTIFIER = '<meta HTTP-EQUIV="REFRESH" content="0; url='
# Text that immediately precedes the remote filename in the page's
# "contentTitle" cell; main() reads the filename from here up to the next '<'.
FILENAME_IDENTIFIER = '<td id="contentTitle" style="overflow:hidden; width:700px;"><b>'


def find_substring(start_identifier, end_identifier, data):
    """Return the text in data between two markers, or None if not found.

    The result starts right after the first occurrence of start_identifier
    and ends right before the first occurrence of end_identifier that
    follows it; neither marker is included in the result.

    Args:
        start_identifier: marker string that precedes the wanted text.
        end_identifier: marker string that terminates the wanted text.
        data: the string to search.

    Returns:
        The extracted substring (possibly empty), or None when either
        marker cannot be found, so callers can test the result for
        truthiness instead of catching ValueError.
    """
    start = data.find(start_identifier)
    if start == -1:
        return None
    # Skip past the start marker itself; the wanted text begins here.
    begin = start + len(start_identifier)
    end = data.find(end_identifier, begin)
    if end == -1:
        return None
    return data[begin:end]


# MAIN #####


def main():
    """Download the file behind a weblink page into the current directory.

    Expects exactly one command-line argument, the weblink URL. The page at
    that URL is fetched, the real download URL and the remote filename are
    scraped out of its HTML with find_substring(), and the file is then
    downloaded and written next to wherever the script is run from.

    Every failure (wrong argument count, non-200 response, markers not
    found) is reported on stdout and the function simply returns None.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("arguments passed in: %s \n" % (str(sys.argv)))
    print(len(sys.argv))
    if len(sys.argv) != 2:
        print('WRONG number of arguments, correct usage: python download-weblink.py "https://long-weblink-string"  (quotes are needed!)')
        return

    response = requests.get(sys.argv[1])
    if response.status_code != 200:
        print('ERROR: did not receive a 200 response code')
        return

    try:
        url = find_substring(URL_IDENTIFIER, '"', response.text)
        remote_filename = find_substring(FILENAME_IDENTIFIER, '<', response.text)
    except ValueError:
        # A missing marker may surface as ValueError rather than None;
        # treat both the same way: the page layout is not what we expect.
        url = remote_filename = None

    if remote_filename:
        # The filename comes from remote HTML: keep only the final path
        # component so it cannot point outside the current directory.
        remote_filename = os.path.basename(remote_filename)
        if remote_filename in ('.', '..'):
            remote_filename = None

    if not url or not remote_filename:
        print('ERROR: unable to find the URL or filename \n')
        return

    print('%s is at %s \n' % (remote_filename, url))

    remote_file = requests.get(url)
    if remote_file.status_code != 200:
        # Do not save an error page under the expected filename.
        print('ERROR: did not receive a 200 response code when downloading %s' % url)
        return

    destination_file_path = os.path.join(os.getcwd(), remote_filename)
    # Binary mode: the payload is written exactly as received; the with
    # block closes the file even if the write fails.
    with open(destination_file_path, 'wb') as destination_file:
        destination_file.write(remote_file.content)


# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

  • « nirvanix connection rest requests
  • Centos5 3 minimal install »

Published

Feb 8, 2013

Category

python

~190 words

Tags

  • http 12
  • oxygen 14
  • python 180
  • scrape 3
  • weblink 1