john pfeiffer
  • Home
  • Categories
  • Tags
  • Archives

http get html scrape parse download

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import javax.net.ssl.HttpsURLConnection;

/**
 * Command-line tool that fetches an HTML page, scrapes a redirect URL and a
 * file name out of it, and downloads the redirect target into the current
 * working directory.
 *
 * <p>The page is scanned line by line. A line containing {@code url=} supplies
 * the download URL (the text between {@code url=} and the following
 * {@code ">}). A line containing {@code contentTitle} supplies the file name
 * (the text between {@code <b>} and the following {@code </b>}).
 */
public class getoxygenweblinkdownload
{
  private static final String CLASSVERSION = "0.13";
  private static final String CORRECTUSAGE = "USAGE: java -jar getoxygenweblinkdownload-0.13 \"https://weblinkhere\" (quotes needed!)";
  private static final String URLIDENTIFIER = "url=";
  private static final String URLIDENTIFIERTERMINATOR = "\">";
  private static final String FILENAMELINEIDENTIFIER = "contentTitle";
  private static final String FILENAMEIDENTIFIER = "<b>";
  private static final String FILENAMEIDENTIFIERTERMINATOR = "</b>";
  /** Size in bytes of the copy buffer used while downloading. */
  private static final int CHUNKSIZE = 524288;
  private static final String NEWLINE = System.getProperty("line.separator");
  private String targetUrlString;
  private String targetFilename;
  /** Collects error messages; an empty builder means no error has been recorded. */
  StringBuilder result;

  /**
   * Returns the version string of this tool.
   *
   * @return the version, currently {@code "0.13"}
   */
  protected static String getVersion()
  {
    return CLASSVERSION;
  }

  /**
   * Entry point: scrapes the page named by the single argument and downloads
   * the file it points to.
   *
   * <p>Exits with status 1 when the argument count is wrong, when the page
   * could not be read, when no file name was found, or when the download
   * fails. In the last three cases the collected error text is printed first.
   *
   * @param args exactly one element: the URL of the page to scrape
   * @throws Exception kept for interface compatibility; I/O failures are
   *         reported through the error text rather than thrown
   */
  public static void main(String[] args)
    throws Exception
  {
    long start = System.currentTimeMillis();
    if (args.length != 1)
    {
      System.err.println("Incorrect number of parameters, " + CORRECTUSAGE);
      System.exit(1);
    }
    getoxygenweblinkdownload main = new getoxygenweblinkdownload();
    main.result = new StringBuilder();
    main.targetUrlString = args[0];

    main.extractDataFromRedirectUrl();
    // Without a scraped name the download would be written to a file called "null".
    if (main.result.length() == 0 && (main.targetFilename == null || main.targetFilename.isEmpty()))
    {
      main.result.append("ERROR: no file name found in the page at " + args[0]);
    }
    if (main.result.length() == 0)
    {
      try
      {
        URL target = new URL(main.targetUrlString);
        System.out.print("DEBUG: starting download of " + main.targetFilename + " from " + main.targetUrlString + NEWLINE);
        main.download(target, main.targetFilename);
        main.outputResult();
        System.out.print("DEBUG: application finished in " + (System.currentTimeMillis() - start) + " ms");
        return;
      }
      catch (IOException e)
      {
        main.result.append("ERROR: download failed: " + e.getMessage());
      }
    }
    // Failure path: make the collected error visible instead of exiting silently.
    main.outputResult();
    System.exit(1);
  }

  /** Prints the collected result text to standard output. */
  private void outputResult()
  {
    System.out.println(this.result);
  }

  /**
   * Reads the page at {@code targetUrlString} and updates
   * {@code targetUrlString} and {@code targetFilename} from the markers found
   * in it. When several lines match, the last successful extraction wins.
   *
   * <p>Failures are not thrown; a message is appended to {@code result}.
   */
  private void extractDataFromRedirectUrl()
  {
    BufferedReader br = null;
    try
    {
      URL target = new URL(this.targetUrlString);
      System.out.print("DEBUG: connecting to " + this.targetUrlString + NEWLINE);
      // openStream() works for any supported protocol; casting the connection
      // to HttpsURLConnection would throw ClassCastException for http:// URLs.
      br = new BufferedReader(new InputStreamReader(target.openStream()));
      String line;
      while ((line = br.readLine()) != null)
      {
        if (line.contains(URLIDENTIFIER))
        {
          String url = extractMetaUrl(line);
          if (url != null)
          {
            this.targetUrlString = url;
          }
        }
        if (line.contains(FILENAMELINEIDENTIFIER))
        {
          String name = extractFilename(line);
          if (name != null)
          {
            this.targetFilename = name;
          }
        }
      }
    }
    catch (MalformedURLException e)
    {
      this.result.append("ERROR: malformed url: " + e.getMessage());
    }
    catch (IOException e)
    {
      this.result.append("ERROR: IOException: " + e.getMessage());
    }
    finally
    {
      if (br != null)
      {
        try
        {
          br.close();
        }
        catch (IOException ignored)
        {
          // Nothing useful can be done if closing the reader fails.
        }
      }
    }
  }

  /**
   * Extracts the text between {@code url=} and the next {@code ">}.
   *
   * @param line one line of the page
   * @return the extracted URL text, or {@code null} if either marker is missing
   */
  private static String extractMetaUrl(String line)
  {
    return extractBetween(line, URLIDENTIFIER, URLIDENTIFIERTERMINATOR);
  }

  /**
   * Extracts the text between {@code <b>} and the next {@code </b>}.
   *
   * @param line one line of the page
   * @return the extracted file name, or {@code null} if either marker is missing
   */
  private static String extractFilename(String line)
  {
    return extractBetween(line, FILENAMEIDENTIFIER, FILENAMEIDENTIFIERTERMINATOR);
  }

  /**
   * Returns the text between the first {@code prefix} and the first
   * {@code terminator} that follows it.
   *
   * @param line the text to search
   * @param prefix marker preceding the wanted text
   * @param terminator marker following the wanted text
   * @return the enclosed text, or {@code null} if {@code line} is
   *         {@code null} or a marker is not found
   */
  private static String extractBetween(String line, String prefix, String terminator)
  {
    if (line == null)
    {
      return null;
    }
    int prefixAt = line.indexOf(prefix);
    if (prefixAt < 0)
    {
      return null;
    }
    int start = prefixAt + prefix.length();
    // Search from 'start' so a terminator earlier on the line is not picked up.
    int end = line.indexOf(terminator, start);
    if (end < 0)
    {
      return null;
    }
    return line.substring(start, end);
  }

  /**
   * Downloads {@code url} into a file in the current working directory
   * ({@code user.dir}).
   *
   * <p>Only the last path component of {@code outputFileName} is used, because
   * the name comes from remote HTML and must not be able to point outside the
   * working directory.
   *
   * @param url the resource to download
   * @param outputFileName the name of the file to write
   * @throws IOException if the name is unusable or reading/writing fails
   */
  private void download(URL url, String outputFileName) throws IOException
  {
    if (outputFileName == null)
    {
      throw new IOException("no output file name given");
    }
    String safeName = new File(outputFileName).getName();
    if (safeName.isEmpty() || safeName.equals(".") || safeName.equals(".."))
    {
      throw new IOException("unusable output file name: " + outputFileName);
    }
    File outputFile = new File(System.getProperty("user.dir"), safeName);
    byte[] buffer = new byte[CHUNKSIZE];
    InputStream inputStream = url.openStream();
    try
    {
      FileOutputStream writer = new FileOutputStream(outputFile);
      try
      {
        int bytesRead;
        // read() returns -1 at end of stream.
        while ((bytesRead = inputStream.read(buffer)) != -1)
        {
          writer.write(buffer, 0, bytesRead);
        }
      }
      finally
      {
        writer.close();
      }
    }
    finally
    {
      inputStream.close();
    }
  }
}

  • « eclipse shortcuts preferences customize syntax text color run as commandline
  • rpsls pytest simple test »

Published

Jan 17, 2013

Category

java

~288 words

Tags

  • download 12
  • get 22
  • html 23
  • http 12
  • java 252
  • parse 5
  • scrape 3