PhpSnip.com

User Stats

Mr Ananse Search Engine Crawler

Crawl links on any URL. Just type in the URL and submit. The code extracts the contents of the links found on the page and stores each one in a text file. Very useful for those considering a fast search mechanism for their site, or those building their own spider (crawler). Please send comments to mynelis@msn.com

Info

 Download  View Source (print view)
 Rating : 4.7  Views : 504

Source Code ( 128 lines )

<?php
##-------------------------------------------------------
#	About the script
#	-----------------------------------------------
#	Code written by Cornelius Duhadzi.
#	Senior Web Developer - Datawebghana, Ghana.
#	This script is a spider (crawler) that reads the
#	links on the page provided. I am developing
#	this script as part of a bigger search engine
#	project. Any comment, contribution or collaboration
#	is welcome.
#	--------------------------------------------------
#	Tel	:	+233.27.743.78.78
#	E-Mail	:	mynelis@msn.com
#	--------------------------------------------------
#	Create a folder called "archive" in the same
#	directory. The collection will be saved there.
##----------------------------------------------------------

// No URL submitted yet: show the input form.
if(!isset($_GET['url'])){
 // PHP_SELF is derived from the request path and can carry attacker-supplied
 // markup (e.g. /script.php/"><script>...), so escape it before echoing.
 $self = htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8');
 echo "<form action='".$self."' method='get'>";
 echo "URL: <input type='text' name='url' size=40>";
 echo "<input type='submit' value='Crawl'>";
 echo "</form>"; // the form element was previously left unclosed
}
if(isset($_GET['url'])){
 /**
  * Minimal link spider.
  *
  * crawl() downloads one page, extracts the first few links from its anchor
  * tags, downloads each linked page and stores its meta description, meta
  * keywords, title and tag-stripped text in archive/<md5 of link>.txt.
  *
  * After crawl() returns, the public properties below describe the run.
  */
 class spider
 {
     /** @var string URL passed to the last crawl() call. */
     public $url = '';
     /** @var string Meta description of the last page archived. */
     public $xdes = '';
     /** @var string Meta keywords of the last page archived. */
     public $xkey = '';
     /** @var string Title of the last page archived. */
     public $title = '';
     /** @var string Comma-separated list of the links that were selected for crawling. */
     public $links = '';
     /** @var string Raw HTML of the last page archived. */
     public $body = '';

     /**
      * Crawl the links found on $url and archive each linked page.
      *
      * Progress is echoed as HTML. If $url is not a fetchable http(s) URL
      * the script terminates via die() with an error message.
      *
      * @param string $url Absolute http:// or https:// URL of the page to crawl.
      * @return void
      */
     public function crawl($url)
     {
         $num_links = 5; // number of links on the page you want to crawl
         ini_set('max_execution_time', 120);
         error_reporting(E_ERROR | E_WARNING | E_PARSE);

         // ?url[]=x would hand us an array; treat anything non-string as empty.
         $url = is_string($url) ? trim($url) : '';

         // Everything echoed below originates from the request or from remote
         // pages, so it is HTML-escaped before output.
         $self  = $this->esc(isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '');
         $again = "<a href='" . $self . "'>Index another</a><br>";
         $fail  = "Unable to locate <b>" . $this->esc($url) . "</b><p>" . $again;

         // Only plain web URLs are accepted. Without this check the stream
         // functions would happily open local files or php:// wrappers.
         // NOTE(review): internal hosts are still reachable; add an allow-list
         // if this is ever exposed publicly.
         $parts  = parse_url($url);
         $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
         if (!is_array($parts) || empty($parts['host']) || ($scheme !== 'http' && $scheme !== 'https')) {
             die($fail);
         }
         $site = $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');

         echo "Indexing <b>" . $this->esc($site) . ".</b> Please wait...<hr noshade size=1 width=100%>";
         flush();

         // open and read url
         $follow = $this->fetch($url);
         if ($follow === false) {
             die($fail);
         }

         // retrieve new links
         $links = $this->extractLinks($follow, $scheme, $site, $num_links);

         // open each link, read metatags, title and content and add to archive
         $xdes  = '';
         $xkey  = '';
         $title = '';
         $page  = '';
         foreach ($links as $link) {
             $html = $this->fetch($link);
             if ($html === false) {
                 continue; // unreachable link: skip it, keep crawling the rest
             }
             $page = $html;
             echo "Current page: <i>" . $this->esc($link) . "</i>.<br>";
             flush();

             // Reset per page so values never leak from the previous page.
             $xdes  = '';
             $xkey  = '';
             $title = '';

             // get_meta_tags() only accepts a filename/URL, so it re-reads the
             // page; warnings for unreachable pages are reported as false.
             $meta = @get_meta_tags($link);
             if (is_array($meta)) {
                 $xdes = isset($meta['description']) ? $meta['description'] : '';
                 $xkey = isset($meta['keywords']) ? $meta['keywords'] : '';
             }

             // retrieve page title ("s" flag: titles may span several lines)
             if (preg_match('#<title[^>]*>(.*?)</title\s*>#is', $page, $found)) {
                 $title = trim($found[1]);
             }

             $this->archive($site, $link, $xdes, $xkey, $title, $this->plainText($page));
         }

         echo "<hr noshade size=1 width=100%><b>" . $this->esc($site) . "</b> indexed successfully.<p>";
         echo $again;

         $this->url   = $url;
         $this->xdes  = $xdes;
         $this->xkey  = $xkey;
         $this->title = $title;
         $this->links = implode(',', $links);
         $this->body  = $page;
     }

     /** Escape a value for safe inclusion in HTML text or attributes. */
     private function esc($text)
     {
         return htmlspecialchars((string) $text, ENT_QUOTES, 'UTF-8');
     }

     /** Download $url; returns the body as a string, or false on failure. */
     private function fetch($url)
     {
         // file_get_contents() emits a warning on failure; the failure is
         // reported to the caller through the false return value instead.
         return @file_get_contents($url);
     }

     /**
      * Collect up to $limit distinct absolute links from the anchors in $html.
      * Query strings and fragments are dropped ("strip out dynamic links").
      */
     private function extractLinks($html, $scheme, $site, $limit)
     {
         // href value may be double-quoted, single-quoted or bare.
         $pattern = '#<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))#i';
         if (!preg_match_all($pattern, $html, $anchors, PREG_SET_ORDER)) {
             return array();
         }

         $links = array();
         foreach ($anchors as $anchor) {
             if (count($links) >= $limit) {
                 break;
             }
             $href = '';
             for ($g = 1; $g <= 3; $g++) {
                 if (isset($anchor[$g]) && $anchor[$g] !== '') {
                     $href = trim($anchor[$g]);
                     break;
                 }
             }
             // cut at the first "?" or "#"
             $href = substr($href, 0, strcspn($href, '?#'));
             if ($href === '') {
                 continue;
             }
             $absolute = $this->absolutize($href, $scheme, $site);
             if ($absolute !== null && !in_array($absolute, $links, true)) {
                 $links[] = $absolute;
             }
         }
         return $links;
     }

     /**
      * Turn $href into an absolute http(s) URL, or return null when it uses
      * another scheme (mailto:, javascript:, ...). Relative links are resolved
      * against the site root, not against the directory of the crawled page.
      */
     private function absolutize($href, $scheme, $site)
     {
         if (preg_match('#^https?://#i', $href)) {
             return $href;
         }
         if (strncmp($href, '//', 2) === 0) {
             return $scheme . ':' . $href; // protocol-relative link
         }
         if (stripos($href, 'www.') === 0) {
             return 'http://' . $href; // bare host name written without scheme
         }
         if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
             return null;
         }
         return $scheme . '://' . $site . '/' . ltrim($href, '/');
     }

     /** Strip script/style blocks and tags from $html and collapse whitespace. */
     private function plainText($html)
     {
         // strip_tags() alone would keep the source text of inline scripts.
         $clean = preg_replace('#<(script|style)\b[^>]*>.*?</\1\s*>#is', ' ', $html);
         if ($clean === null) {
             $clean = $html; // PCRE limit hit on a huge page: fall back to raw input
         }
         $text = preg_replace('/\s+/', ' ', strip_tags($clean));
         return trim($text === null ? strip_tags($clean) : $text);
     }

     /** Write one ":|:"-delimited record to archive/<md5 of link>.txt. */
     private function archive($site, $link, $xdes, $xkey, $title, $body)
     {
         $DL   = ":|:";
         $all  = $site . $DL . $link . $DL . $xdes . $DL . $xkey . $DL . $title . $DL . $body;
         $path = "archive/" . md5($link) . ".txt";
         // The archive directory must already exist and be writable.
         if (@file_put_contents($path, $all) === false) {
             echo "Could not save <i>" . $this->esc($path) . "</i>.<br>";
         }
     }
 }
 
// Run the crawler against the URL submitted through the form.
 $arch = new spider();
 $arch->crawl($_GET['url']);
}
?>

Search

Subscribe

  Rss Feeds

Sponsors

Advertise