john pfeiffer
  • Home
  • Categories
  • Tags
  • Archives

php function to extract links into array

<html>
<?php
// The form posts back to this same script. PHP_SELF is derived from the
// request path, which the visitor controls, so it is HTML-escaped before
// being placed inside the action attribute.
?>
<form action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8'); ?>" method="post">

target url:&nbsp;<input type="text" name="target_url"  maxlength="256" />
<input type="Submit" value="Submit" />
</form>

<?php
//john pfeiffer 2009-11

//php-function-to-extract-links-into-array

/**
 * Extract every complete anchor element that carries a quoted href attribute.
 *
 * Some pages (Google, for example) place other attributes such as class
 * before href and quote the value with either ' or ", so the pattern accepts
 * any attributes ahead of href and either quote character.
 *
 * @param string $html markup to scan
 * @return array matched "<a ... href=...>...</a>" strings in document order;
 *               empty when nothing matches or the pattern fails to run
 */
function extract_anchor_tags($html)
{
    // "~" is the delimiter so the "/" in "</a>" needs no escaping.
    //   <a\s+              opening tag name followed by whitespace
    //   (?:[^>]*?\s)?      optional earlier attributes, ending in whitespace
    //                      (this keeps "data-href" from counting as href)
    //   href\s*=\s*        the attribute and its equals sign
    //   (["\'])(.*?)\1     value wrapped in a matching pair of quotes
    //   [^>]*>             rest of the opening tag
    //   (.*?)</a\s*>       shortest content up to the closing tag
    // Flags: i = case-insensitive, s = "." also matches newlines.
    $pattern = '~<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1[^>]*>(.*?)</a\s*>~is';

    $matches = array();
    $found = preg_match_all($pattern, (string) $html, $matches);

    // false signals a PCRE failure, 0 means no anchors were found.
    if ($found === false || $found === 0)
    {
        return array();
    }

    // Index 0 holds the full text of each match.
    return $matches[0];
}

// Accept only a string value; "target_url[]=x" would arrive as an array.
$target_url = '';
if (isset($_POST['target_url']) && is_string($_POST['target_url']))
{
    // Deliberately not lowercased: URL paths and query strings are
    // case-sensitive.
    $target_url = trim($_POST['target_url']);
}

if ($target_url === '')
{
    echo "<b>You must fill in the target url.</b>";
}
elseif (!preg_match('~^https?://~i', $target_url))
{
    // Without this check file_get_contents() would also open local paths
    // and php:// style wrappers named by the visitor.
    echo "Error: target url must start with http:// or https://";
}
else
{
    // NOTE(review): this still fetches any http(s) host the visitor names,
    // including hosts only reachable from this server. Restrict the allowed
    // hosts before exposing the page publicly.
    $result = file_get_contents($target_url);

    if ($result === false || $result === '')
    {   echo "Error: could not get page";   }
    else
    {
        $links = extract_anchor_tags($result);

        echo "<pre>";
        echo count($links) . "\n";

        // Escape each match so the anchors are shown as text, not rendered.
        foreach ($links as $link)
        {
            echo htmlspecialchars($link) . "\n<br />";
        }

        echo "</pre>";
    }//end else error webpage not found
}//end else webpage form not filled in

?>

</html>



<?php

// The regex commonly found online has gaps when the anchor markup is poorly formed:
//  preg_match_all('/<a[\s]+[^>]*href\s*=\s*([\"\']+)([^>]+?)(\1|>)/i', $html, $m);




?>

  • « php functions
  • php for loops arrays »

Published

Feb 6, 2010

Category

php

~229 words

Tags

  • array 16
  • extract 6
  • function 14
  • into 3
  • links 4
  • php 82
  • to 63