Crawl the links on any URL: just type in the URL and submit. The code follows the first links found on that page (five by default), extracts the contents of each linked page, and stores every page in its own text file. Very useful for those considering a fast search mechanism for their site, or those building their own spider (crawler). Please send comments to mynelis@msn.com
Download
View Source (print view)
Rating : 4.7
Views : 504
<?php
##-------------------------------------------------------
# About the script
# -----------------------------------------------
# Code written by Cornelius Duhadzi.
# Senior Web Developer - Datawebghana, Ghana.
# This script is a spider or crawler that reads all
# links on the provided page. I am developing
# this script as part of a bigger search engine
# project. Any comment, contribution or collaboration
# is welcome.
# --------------------------------------------------
# Tel : +233.27.743.78.78
# E-Mail : mynelis@msn.com
# --------------------------------------------------
# create a folder called archive in the same
# directory. Collection will be saved here.
##----------------------------------------------------------

// No URL submitted yet: show the input form. It posts back to this same
// script with the address in the "url" query parameter.
if(!isset($_GET['url'])){
    // PHP_SELF is built from the request URI and can carry attacker-chosen
    // characters, so it is escaped before being placed inside the attribute.
    $form_action = htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8');
    echo "<form action='".$form_action."' method='get'>";
    echo "URL: <input type='text' name='url' size=40>";
    echo "<input type='submit' value='Crawl'>";
    echo "</form>";
}
if(isset($_GET['url'])){
/**
 * Single-level web spider.
 *
 * crawl() downloads one page, collects the first few links found on it,
 * downloads every linked page and saves its meta description, meta keywords,
 * title and plain text to a file in the archive/ directory.
 */
class spider{
    // State left behind by the most recent crawl() call. Declared explicitly
    // so that assigning them does not create dynamic properties.
    public $url = '';    // start URL that was crawled
    public $xdes = '';   // meta description of the last archived page
    public $xkey = '';   // meta keywords of the last archived page
    public $title = '';  // <title> text of the last archived page
    public $links = '';  // absolute URLs that were queued, joined with commas
    public $body = '';   // raw HTML of the last archived page

    /**
     * Crawl the links found on $url and archive every linked page.
     *
     * Progress is echoed as HTML. Each archived page is written to
     * archive/<md5 of its URL>.txt as
     * site:|:url:|:description:|:keywords:|:title:|:text.
     * The start page itself is only used as a source of links.
     * The script is terminated with die() when $url is not an http(s) URL or
     * cannot be downloaded.
     *
     * @param string $url       Absolute http:// or https:// URL of the start page.
     * @param int    $num_links Maximum number of links to follow (default 5).
     * @return void
     */
    public function crawl($url, $num_links = 5){
        ini_set('max_execution_time', 120);
        error_reporting(E_ERROR | E_WARNING | E_PARSE);

        $self = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
        $again = "<a href='".$this->esc($self)."'>Index another</a><br>";

        // Only plain web URLs are accepted: the value normally comes straight
        // from the query string, and the stream functions would otherwise
        // open file://, php:// and similar wrappers just as readily.
        $url = is_string($url) ? trim($url) : '';
        $parts = ($url === '') ? false : parse_url($url);
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
        if(($scheme !== 'http' && $scheme !== 'https') || empty($parts['host'])){
            die("Unable to locate <b>".$this->esc($url)."</b><p>".$again);
        }

        // Host (plus port) identifies the site; the directory of the start
        // page is the base against which relative links are resolved.
        $site = $parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
        $root = $scheme.'://'.$site;
        $path = isset($parts['path']) ? $parts['path'] : '/';
        $slash = strrpos($path, '/');
        $base_dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        echo "Indexing <b>".$this->esc($site).".</b> Please wait...<hr noshade size=1 width=100%>";
        flush();

        // Warnings are suppressed on purpose: failure is reported below.
        $follow = @file_get_contents($url);
        if($follow === false){
            die("Unable to locate <b>".$this->esc($url)."</b><p>".$again);
        }

        $queue = $this->collect_links($follow, $scheme, $root, $base_dir, (int)$num_links);

        $dir = 'archive/';
        if(!is_dir($dir)){
            // Best effort; a failure shows up as a write error per page below.
            @mkdir($dir, 0755);
        }

        $DL = ':|:';
        $xdes = '';
        $xkey = '';
        $title = '';
        $page = '';
        foreach($queue as $target){
            $html = @file_get_contents($target);
            if($html === false){
                continue; // unreachable links are skipped silently
            }
            echo "Current page: <i>".$this->esc($target)."</i>.<br>";
            flush();
            $page = $html;

            // get_meta_tags() only accepts a file name or URL, so this issues
            // a second request for the page. Values are reset for every page
            // so nothing carries over from the previous one.
            $meta = @get_meta_tags($target);
            $xdes = (is_array($meta) && isset($meta['description'])) ? $meta['description'] : '';
            $xkey = (is_array($meta) && isset($meta['keywords'])) ? $meta['keywords'] : '';

            // "~" is the delimiter so that the "/" in "</title>" needs no escaping.
            $title = preg_match('~<title\b[^>]*>(.*?)</title\s*>~is', $page, $found_title)
                ? trim($found_title[1])
                : '';

            $record = implode($DL, array($site, $target, $xdes, $xkey, $title, $this->plain_text($page)));
            $file = $dir.md5($target).'.txt';
            if(@file_put_contents($file, $record) === false){
                echo "Could not save <i>".$this->esc($file)."</i>.<br>";
            }
        }

        echo "<hr noshade size=1 width=100%><b>".$this->esc($site)."</b> indexed successfully.<p>";
        echo $again;

        $this->url = $url;
        $this->xdes = $xdes;
        $this->xkey = $xkey;
        $this->title = $title;
        $this->links = implode(',', $queue);
        $this->body = $page;
    }

    /**
     * Return up to $limit distinct absolute URLs taken from the <a href>
     * attributes of $html, in document order.
     */
    private function collect_links($html, $scheme, $root, $base_dir, $limit){
        $found = array();
        // The href value may be double-quoted, single-quoted or bare; only
        // the value itself is captured, never the attributes that follow it.
        $pattern = '~<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))~i';
        if($limit < 1 || !preg_match_all($pattern, $html, $hits, PREG_SET_ORDER)){
            return $found;
        }
        foreach($hits as $hit){
            $href = '';
            for($g = 1; $g <= 3; $g++){
                if(isset($hit[$g]) && $hit[$g] !== ''){
                    $href = $hit[$g];
                    break;
                }
            }
            $href = html_entity_decode(trim($href), ENT_QUOTES, 'UTF-8');
            $absolute = $this->absolute_url($href, $scheme, $root, $base_dir);
            if($absolute === '' || in_array($absolute, $found, true)){
                continue;
            }
            $found[] = $absolute;
            if(count($found) >= $limit){
                break;
            }
        }
        return $found;
    }

    /**
     * Turn one href value into an absolute http(s) URL without query string
     * or fragment. Returns '' for links that cannot be crawled (empty,
     * fragment-only, mailto:, javascript:, ...). "../" segments are passed
     * through unchanged.
     */
    private function absolute_url($href, $scheme, $root, $base_dir){
        // Strip the "dynamic" part: everything from the first ? or #.
        $href = (string)substr($href, 0, strcspn($href, '?#'));
        if($href === ''){
            return '';
        }
        if(preg_match('~^https?://~i', $href)){
            return $href;
        }
        if(substr($href, 0, 2) === '//'){
            return $scheme.':'.$href; // scheme-relative link
        }
        if(stripos($href, 'www.') === 0){
            return 'http://'.$href; // host written without a scheme
        }
        if(preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)){
            return ''; // any other scheme is not crawlable
        }
        if($href[0] === '/'){
            return $root.$href;
        }
        return $root.$base_dir.$href;
    }

    /**
     * Reduce an HTML document to plain text: script and style blocks are
     * removed together with their contents, remaining tags are stripped and
     * runs of whitespace are collapsed to single spaces.
     */
    private function plain_text($html){
        $clean = preg_replace('~<(script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $html);
        if($clean === null){
            $clean = $html; // regex engine gave up; fall back to the raw markup
        }
        $text = strip_tags($clean);
        $squeezed = preg_replace('/\s+/', ' ', $text);
        return trim($squeezed === null ? $text : $squeezed);
    }

    /** Escape a value for safe inclusion in the HTML progress output. */
    private function esc($value){
        return htmlspecialchars((string)$value, ENT_QUOTES, 'UTF-8');
    }
}
// A URL was submitted: create the spider and let it crawl that address.
$arch = new spider();
$arch->crawl($_GET['url']);
}
?>
download, regirstration form, php_snips, fedex, call php', userstatusinonline, php_script"_class="neww"_target="_blank"_title="im_neuen, best ide, php script" class="neww" target="_b..., forum, p t test paired, guest, php_tools, href php function, utfraw, php jobs, php script and char(124) (select cast(count(1) as varchar(8000)) char(124) from [sysobjects] where 1=1)>0 and =, commit, gonzo_florin, yahoo status check, rss