john pfeiffer
  • Home
  • Categories
  • Tags
  • Archives

requests beautifulsoup html parse scrape link get post library byte to hex

class EmoticonDownloader(object):
    """Scrape the HipChat emoticons listing page for shortcut/image-url pairs."""

    def __init__(self, listing_url='https://www.hipchat.com/emoticons'):
        # Page that lists every emoticon as a <div class="emoticon-block">.
        self.listing_url = listing_url

    def get_emoticon_listing(self):
        """
        Fetch ``self.listing_url`` and parse out the emoticon entries.

        :return: list of (shortcut, image url) tuples; empty list when the
            page could not be fetched (non-200 response)
        """
        emoticon_shortcuts_and_image_urls = list()
        r = requests.get(self.listing_url)
        print('{}: {}'.format(self.listing_url, r.status_code))
        if r.status_code != 200:
            # BUG FIX: requests' Response has no `.status` attribute (that
            # would raise AttributeError here); the correct name is .status_code.
            print('ERROR: could not get emoticon list: {}'.format(r.status_code))
        else:
            # Name the parser explicitly; without it bs4 emits a warning and
            # the chosen parser (and thus the tree) can vary by environment.
            soup = BeautifulSoup(r.content, 'html.parser')
            # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
            emoticons_blocks = soup.find_all('div', {'class': 'emoticon-block'})
            print('found {} emoticons'.format(len(emoticons_blocks)))
            for i in emoticons_blocks:
                # contents[0] is the leading '\n' text node; contents[1] is the <img>.
                emoticon_shortcuts_and_image_urls.append((i['data-clipboard-text'], i.contents[1]['src']))
        return emoticon_shortcuts_and_image_urls


- - -
import requests

# Fetch the target page and extract the value of `somequeryparam` from the
# first matching anchor tag, by plain string search on the raw body.
# NOTE(review): `base` must be defined before this snippet runs — TODO confirm.
response = requests.get(base + '/targetresource')
html = response.content

TARGET_STRING = '<a href="?somequeryparam='
begin_index = html.find(TARGET_STRING)
print(begin_index)
# BUG FIX: the original searched an undefined name `htmlResponse` (NameError);
# search the same body that TARGET_STRING was found in.
end_index = html.find('">', begin_index)
print(end_index)
# Slice out only the query-parameter value between the marker and the closing '">' .
print(html[begin_index + len(TARGET_STRING):end_index])

- - -
# -*- coding: utf-8 -*-
import os
import sys
import logging


# BUG FIX: the bundled third-party libraries in ./lib must be added to the
# import path BEFORE importing webapp2 and friends — in the original this
# ran after the imports, too late to help them resolve.
app_root_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(app_root_dir, 'lib'))

from webapp2 import WSGIApplication
import routes
from config import config


# Module-level WSGI entry point consumed by the app server.
# debug=True surfaces stack traces in responses; disable for production.
app = WSGIApplication(routes.urls, config=config, debug=True)


- - -
# -*- coding: utf-8 -*-
from webapp2 import Route
import handlers

# Map URLs to handlers in the handlers module, optionally choosing a specific
# target method and request type.
# BUG FIX: webapp2 route names must be unique for uri_for() reversal — the
# original registered both routes as name='index'.
urls = [
    Route(r'/', handler='handlers.IndexHandler', name='index', handler_method='get', methods=['GET']),
    Route(r'/oauthconsumer', handler='handlers.OAuthConsumerHandler', name='oauthconsumer', handler_method='get', methods=['GET']),
]


# -*- coding: utf-8 -*-
import os

# Application-wide settings consumed by the WSGIApplication in main.py.
config = {
    'template.dir': 'templates/',
    'assets.dir': 'assets/',
    'webapp2_extras.sessions': {'secret_key': 'my-super-secret-key'},
    'debug': {
        'enabled': True,
        'log_level': 'DEBUG',
    },
}


from index_handler import *     # handlers/__init__.py

# -*- coding: utf-8 -*-
import webapp2
import requests

class IndexHandler(webapp2.RequestHandler):
    """GET /: fetch the configured server's homepage and echo its status,
    headers, body, and the first KB of the raw response rendered as hex."""

    # Target site fetched on every request.
    server = 'http://kittyandbear.net'

    def get(self):
        response = requests.get(self.server)
        self.response.out.write('GET %s = %s <br />\n' % (self.server , response.status_code))
        self.response.out.write('HEADERS = %s <br />\n' % (response.headers))
        self.response.out.write('%s' % (response.text))


        self.response.out.write('raw data as hex:<br /><br />\n')
        CHUNK = 1024
        # stream=True defers the download so .raw exposes the unread stream.
        response_raw = requests.get(self.server , stream = True)
        # BUG FIX: removed the original's bare `response_raw.raw` statement —
        # an attribute access with no effect (dead code).
        raw_data = response_raw.raw.read(CHUNK)
        if raw_data:
            self.response.out.write('raw bytes = %s' % (len(raw_data)))
            self.response.out.write('\n<br /><br />\n')
            self.response.out.write(ByteToHex(str(raw_data)))
        else:
            self.response.out.write('raw data is null')


def ByteToHex(byteStr):
    """Render *byteStr* as space-separated uppercase hex pairs, e.g. 'AB' -> '41 42'.

    :param byteStr: a str whose characters are treated as byte values
    :return: hex string, or '' for empty input
    """
    # join over a generator replaces the original append-loop, whose local
    # accumulator `hex` shadowed the builtin; no trailing-space strip needed.
    return ' '.join('%02X' % ord(ch) for ch in byteStr)

  • « path name directory parts extraction posixpath os path conversion
  • concurrency threading get url parse html output queue »

Published

Jan 21, 2015

Category

python

~280 words

Tags

  • beautifulsoup 1
  • byte 2
  • get 22
  • hex 3
  • html 23
  • library 5
  • link 4
  • parse 5
  • post 12
  • python 180
  • requests 5
  • scrape 3
  • to 63