<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Extract links from a web page</title>
</head>
<?php
// Self-posting form that collects the URL of the page to scan.
// PHP_SELF contains any extra path info the client appended to the script
// URL, so it is attacker-influenced and must be HTML-escaped before it is
// written into the action attribute (otherwise: reflected XSS).
?>
<form action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8'); ?>" method="post">
<label for="target_url">target url:</label>
<input type="text" id="target_url" name="target_url" maxlength="256" />
<input type="submit" value="Submit" />
</form>
<?php
//john pfeiffer 2009-11
//php-function-to-extract-links-into-array
//
// Fetches the page named by the posted "target_url" field and prints, inside
// a <pre> block, the number of anchor elements that carry a quoted href
// followed by each matching element (HTML-escaped).
//
// Notes kept from the original experiments with the pattern:
//  - some sites (e.g. google) put other attributes such as class between the
//    "a" and the href, so attributes before href must be allowed;
//  - the href value may be quoted with either ' or ".

/**
 * Tell whether $url is an absolute http:// or https:// URL with a host.
 *
 * Used to keep file_get_contents() away from local paths and from stream
 * wrappers such as file:// or php://.
 *
 * @param mixed $url value to check
 * @return bool true only for strings whose scheme is http or https
 */
function is_http_url($url)
{
    if (!is_string($url)) {
        return false;
    }
    $parts = parse_url($url);
    if ($parts === false || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }
    // Only the scheme is case-insensitive; the URL itself is left untouched.
    return in_array(strtolower($parts['scheme']), array('http', 'https'), true);
}

/**
 * Collect every <a ... href="..."> ... </a> element found in $html.
 *
 * The href value must be quoted with ' or " (closed by the same quote);
 * other attributes may appear before or after it. Matching ignores case and
 * may span several lines.
 *
 * @param string $html markup to scan
 * @return array list of complete anchor elements as they appear in $html;
 *               empty when nothing matches or the regex engine gives up
 */
function extract_anchor_tags($html)
{
    // '#' is the delimiter so that the '/' of the closing tag needs no escaping.
    // (?:[^>]*?\s)? lets other attributes precede href but requires whitespace
    // right before it, so names that merely end in "href" are not matched.
    $pattern = '#<a\s+(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1[^>]*>.*?</a\s*>#is';
    $matches = array();
    $count = preg_match_all($pattern, $html, $matches);
    if ($count === false || $count === 0) {
        return array();
    }
    return $matches[0];
}

// A non-string value (e.g. target_url[]=x) is treated the same as a missing one.
$target_url = '';
if (isset($_POST['target_url']) && is_string($_POST['target_url'])) {
    $target_url = trim($_POST['target_url']);
}

if ($target_url === '') {
    echo "<b>You must fill in the target url.</b>";
} elseif (!is_http_url($target_url)) {
    echo "Error: only http:// and https:// URLs are allowed";
} else {
    // NOTE(security): this still performs a server-side request to a
    // user-chosen host, so internal addresses remain reachable (SSRF).
    // Restrict the allowed hosts before exposing this page publicly.
    $result = file_get_contents($target_url);
    if ($result === false || $result === '') {
        echo "Error: could not get page";
    } else {
        $anchors = extract_anchor_tags($result);
        echo "<pre>";
        echo sizeof($anchors) . "\n";
        foreach ($anchors as $value) {
            // Escaped so the fetched markup is displayed, not rendered.
            echo htmlspecialchars($value) . "\n<br />";
        }
        echo "</pre>";
    }
}
?>
</html>
<?php
// The regex commonly circulated on the internet (below) has some holes when the markup is poorly formed.
// preg_match_all('/<a[\s]+[^>]*href\s*=\s*([\"\']+)([^>]+?)(\1|>)/i', $html, $m);
?>