Home Tools

How to create a Web Crawler using PHP

A web crawler is a computer program that collects key values (HREF links, image links, metadata, etc.) from a given website URL. It is designed to follow the HREF links it has already fetched from previous pages, so that it can jump from one website to another. It is also called a web spider or web bot. This mechanism forms the backbone of a web search engine.
config.php
<?php
/*
 * Crawler configuration.
 *
 * Everything here is defined as a global variable and picked up by
 * crawler.php through `global` declarations, so the variable names are
 * part of the contract between the two files.
 */

//---- Href Element Syntax Start ----
// Tokens that must appear, in this order, for an anchor link to be recognised:
// tag opener, attribute name, assignment sign, tag closer.
$hrefTag = array();
$hrefTag[0] = "<a";
$hrefTag[1] = "href";
$hrefTag[2] = "=";
$hrefTag[3] = ">";
// Matcher state: index of the token currently searched and total token count.
$hrefTagCountStart = 0;
$hrefTagCountFinal = count($hrefTag);
// Matcher state: characters matched so far in the current token and its length.
$hrefTagLengthStart = 0;
$hrefTagLengthFinal = strlen($hrefTag[0]);
// Reference to the token currently being matched.
$hrefTagPointer =& $hrefTag[0];
//---- Href Element Syntax End ----

//---- Image Element Syntax Start ----
// Same layout as the href tokens, for <img src="...">.
$imgTag = array();
$imgTag[0] = "<img";
$imgTag[1] = "src";
$imgTag[2] = "=";
$imgTag[3] = ">";
$imgTagCountStart = 0;
$imgTagCountFinal = count($imgTag);
$imgTagLengthStart = 0;
$imgTagLengthFinal = strlen($imgTag[0]);
$imgTagPointer =& $imgTag[0];
//---- Image Element Syntax End ----

//---- Valid Domain Start -----
// Extensions classified as "domain" links. The list mixes page extensions
// with top-level domains because the crawler takes the extension from the
// whole URL, so "http://example.com" yields the extension "com".
$Url_Extensions = array(
    "asp", "aspx", "html", "htm", "php", "php3", "biz", "com", "edu", "gov", "info", "int", "jobs",
    "net", "org", "in", "us", "uk",
);
//---- Valid Domain End -------

//---- Valid File Extension Start ----
$Document_Extensions = array("doc", "pdf", "ppt", "txt");
$Image_Extensions = array("gif", "jpeg", "jpg", "png");
//---- Valid File Extension End ------

//---- Curl Parameters Start ----
$crawlOptions = array(
    CURLOPT_RETURNTRANSFER  => true,            // return web page
    CURLOPT_HEADER          => false,           // don't return headers
    CURLOPT_FOLLOWLOCATION  => true,            // follow redirects
    CURLOPT_ENCODING        => "",              // handle all encodings
    CURLOPT_USERAGENT       => "AlgoberryBot",  // who am i
    CURLOPT_AUTOREFERER     => true,            // set referer on redirect
    CURLOPT_CONNECTTIMEOUT  => 120,             // timeout on connect
    CURLOPT_TIMEOUT         => 120,             // timeout on response
    CURLOPT_MAXREDIRS       => 10,              // stop after 10 redirects
    // The URL comes from a web form, so only web protocols are allowed;
    // this keeps file://, gopher:// and similar schemes from being fetched,
    // either directly or through a redirect.
    CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
);
//---- Curl Parameters End ------
?>
crawler.php
<?php
/**
 * Single-page link extractor.
 *
 * Downloads the page at $siteURL with cURL, scans the markup character by
 * character for the token sequences configured in the globals $hrefTag and
 * $imgTag, resolves every quoted attribute value it finds against the crawled
 * URL and files the result under "domain", "image", "document" or "unknown"
 * according to the extension lists in $Url_Extensions, $Image_Extensions and
 * $Document_Extensions.
 *
 * The globals are read only; without them no tag is matched and every link
 * with an extension is classified as "unknown".
 */
class webCrawler
{
    /** @var string URL of the page to crawl; set by the caller before parser(). */
    public $siteURL;

    /** @var string Error message of the last parser() call, "" when it succeeded. */
    public $error;

    /** @var array Result of the last parser() call (see parser() for the shape). */
    public $linkBuffer;

    /** @var array Crawled URL split on "/", e.g. array("http:", "", "host", "dir"). */
    private $urlParts = array();

    /** @var string Base that plain relative links are appended to. */
    private $directoryURL = "";

    public function __construct()
    {
        $this->siteURL = "";
        $this->error = "";
        $this->linkBuffer = array();
    }

    /**
     * Crawls $this->siteURL and collects the links found on that page.
     *
     * Every call starts from an empty result and a cleared error.
     *
     * @return array Map with zero or more of the keys "domain", "image",
     *               "document", "unknown" (each a list of unique URLs; a key
     *               is present only when at least one link was filed under
     *               it) and "error" (string) when the crawl failed.
     */
    public function parser()
    {
        global $hrefTag, $imgTag, $crawlOptions;

        $this->error = "";
        $this->linkBuffer = array();

        $url = trim((string) $this->siteURL);
        if ($url == "") {
            $this->error = "Please enter url";
        } else {
            $crawlURL = rtrim($url, "/");

            // dirname() of a bare origin ("http://host") is just the scheme;
            // in that case the origin itself is the base for relative links.
            $this->directoryURL = dirname($crawlURL);
            if ($this->directoryURL == "http:" || $this->directoryURL == "https:") {
                $this->directoryURL = $crawlURL;
            }
            $this->urlParts = explode("/", $crawlURL);

            $webPageContent = $this->fetch($crawlURL, $crawlOptions);
            if ($webPageContent === false) {
                $this->error = "Unable to proceed, permission denied";
            } else {
                $this->scan($webPageContent, array($hrefTag, $imgTag));
            }
        }

        if ($this->error != "") {
            $this->linkBuffer["error"] = $this->error;
        }

        return $this->linkBuffer;
    }

    /**
     * Downloads one URL.
     *
     * @param string $crawlURL URL to download.
     * @param mixed  $options  cURL option map; ignored unless it is an array.
     * @return string|false Response body, or false on any cURL error.
     */
    private function fetch($crawlURL, $options)
    {
        $curlObject = curl_init($crawlURL);
        if ($curlObject === false) {
            return false;
        }
        if (is_array($options)) {
            curl_setopt_array($curlObject, $options);
        }
        // The scanner needs the body as a string whatever the options say.
        curl_setopt($curlObject, CURLOPT_RETURNTRANSFER, true);

        $body = curl_exec($curlObject);
        $errorNumber = curl_errno($curlObject);
        curl_close($curlObject);

        return ($errorNumber == 0 && is_string($body)) ? $body : false;
    }

    /**
     * Runs one token matcher per token set over the page.
     *
     * Each matcher keeps its own state (stage, offset, captured value and
     * prefix counters), so a value captured for one tag kind cannot disturb
     * the other and nothing leaks into the next parser() call.
     *
     * @param string $content   Page markup.
     * @param array  $tokenSets List of token lists such as
     *                          array("<a", "href", "=", ">"); entries that
     *                          are not non-empty arrays are skipped.
     * @return void
     */
    private function scan($content, array $tokenSets)
    {
        $matchers = array();
        foreach ($tokenSets as $tokens) {
            if (is_array($tokens) && count($tokens) > 0) {
                $matchers[] = array(
                    "tokens" => array_values($tokens),
                    "stage"  => 0,   // index of the token being searched
                    "offset" => 0,   // characters of that token matched so far
                    "url"    => "",  // attribute value captured for this tag
                    "prefix" => array("parent" => 0, "single" => 0, "double" => 0),
                );
            }
        }

        $length = strlen($content);
        $position = 0;
        while ($position < $length) {
            // Tokens are lower-case, so tag and attribute names match in any case.
            $character = strtolower($content[$position]);
            foreach (array_keys($matchers) as $key) {
                // A matcher that consumes an attribute value moves $position and
                // hands the character it stopped on to the matchers after it.
                $character = $this->feed($matchers[$key], $character, $content, $position, $length);
            }
            $position++;
        }
    }

    /**
     * Advances one matcher by one character.
     *
     * Once the second-to-last token (the "=") is complete the quoted value
     * that follows is captured; once the last token (the ">") is complete the
     * captured value is resolved and stored.
     *
     * @param array  $state     Matcher state created by scan(); updated in place.
     * @param string $character Current page character.
     * @param string $content   Page markup.
     * @param int    $position  Current index into $content; moved when a value is consumed.
     * @param int    $length    Byte length of $content.
     * @return string Character the next matcher should be fed.
     */
    private function feed(array &$state, $character, $content, &$position, $length)
    {
        $token = $state["tokens"][$state["stage"]];

        if ($token[$state["offset"]] !== $character) {
            // A failed partial match may itself start a new match ("<<a").
            $state["offset"] = ($state["offset"] > 0 && $token[0] === $character) ? 1 : 0;
            return $character;
        }

        $state["offset"]++;
        if ($state["offset"] < strlen($token)) {
            return $character;
        }

        // The whole token has been seen: move on to the next one.
        $state["offset"] = 0;
        $state["stage"]++;
        $lastStage = count($state["tokens"]) - 1;

        if ($state["stage"] > $lastStage) {
            if ($state["url"] != "") {
                $this->recordLink($state["url"], $state["prefix"]);
            }
            $state["url"] = "";
            $state["stage"] = 0;
        } elseif ($state["stage"] == $lastStage) {
            list($state["url"], $state["prefix"], $character) =
                $this->readAttributeValue($content, $position, $length, $character);
        }

        return $character;
    }

    /**
     * Reads the quoted attribute value that follows the current position.
     *
     * Leading "." and "/" characters are not copied into the value; they are
     * counted so the caller can tell "../", "/" and "//" prefixes apart.
     * Reading stops at the matching closing quote or at "#" (fragments are
     * dropped), leaving $position on the character before the terminator so
     * the main loop visits the terminator next. When a ">" shows up before
     * any quote the value is unquoted and is skipped.
     *
     * @param string $content   Page markup.
     * @param int    $position  Index of the "=" on entry; updated in place.
     * @param int    $length    Byte length of $content.
     * @param string $character Current character, returned unchanged when nothing is read.
     * @return array array(value, prefix counters, last character read).
     */
    private function readAttributeValue($content, &$position, $length, $character)
    {
        $url = "";
        $prefix = array("parent" => 0, "single" => 0, "double" => 0);

        // Skip ahead to the opening quote.
        $quote = "";
        $position++;
        while ($position < $length) {
            $character = $content[$position];
            if ($character === "\"" || $character === "'") {
                $quote = $character;
                break;
            }
            if ($character === ">") {
                // Unquoted value: step back so the tag closer is matched normally.
                $position--;
                return array($url, $prefix, $character);
            }
            $position++;
        }
        if ($quote === "") {
            return array($url, $prefix, $character);
        }

        $dotCount = 0;
        $slashCount = 0;
        $position++;
        while ($position < $length) {
            $character = $content[$position];
            if ($character === $quote || $character === "#") {
                $position--;
                break;
            }

            if ($url !== "" || ($character !== "." && $character !== "/")) {
                $url .= $character;
            } elseif ($character === ".") {
                $dotCount++;
                $slashCount = 0;
            } else {
                $slashCount++;
                if ($dotCount == 2 && $slashCount == 1) {
                    $prefix["parent"]++;        // "../"
                } elseif ($dotCount == 0 && $slashCount == 1) {
                    $prefix["single"]++;        // "/"
                } elseif ($dotCount == 0 && $slashCount == 2) {
                    $prefix["double"]++;        // "//"
                }
                $dotCount = 0;
            }
            $position++;
        }

        return array($url, $prefix, $character);
    }

    /**
     * Resolves a captured value to an absolute URL and stores it once.
     *
     * Values that do not validate as a URL, even after being appended to the
     * crawled directory, are dropped.
     *
     * @param string $url    Value with its leading "."/"/" prefix removed.
     * @param array  $prefix Prefix counters from readAttributeValue().
     * @return void
     */
    private function recordLink($url, array $prefix)
    {
        $parts = $this->urlParts;
        $partCount = count($parts);
        // True when the crawled URL looks like "scheme://host[/...]".
        $hasOrigin = $partCount >= 3 && $parts[1] === "" && substr($parts[0], -1) === ":";

        if ($prefix["double"] >= 1) {
            // Protocol-relative link: reuse the scheme of the crawled page.
            $scheme = preg_match('/^https?:$/i', $parts[0]) ? $parts[0] : "http:";
            $url = $scheme . "//" . $url;
        } elseif ($prefix["parent"] >= 1) {
            // Drop one path segment per "../", but never climb above the host.
            $keep = $partCount - $prefix["parent"];
            if ($hasOrigin && $keep < 3) {
                $keep = 3;
            }
            $base = "";
            for ($index = 0; $index < $keep; $index++) {
                $base .= $parts[$index] . "/";
            }
            $url = $base . $url;
        } elseif ($prefix["single"] >= 1 && $hasOrigin) {
            // Root-relative link: scheme + host of the crawled page.
            $url = $parts[0] . "/" . $parts[1] . "/" . $parts[2] . "/" . $url;
        }

        $url = rtrim(urldecode($url), "/");
        $host = $this->hostOf($url);
        if ($host == "") {
            // Not absolute yet: treat it as relative to the crawled directory.
            $url = $this->directoryURL . "/" . $url;
            $host = $this->hostOf($url);
        }
        if ($host == "") {
            return;
        }

        $type = $this->classify($url);
        if (!isset($this->linkBuffer[$type])) {
            $this->linkBuffer[$type] = array();
        }
        if (!in_array($url, $this->linkBuffer[$type], true)) {
            $this->linkBuffer[$type][] = $url;
        }
    }

    /**
     * Returns the lower-cased host of a valid URL, or "" when the string is
     * not a valid URL or has no host.
     *
     * @param string $url Candidate URL.
     * @return string
     */
    private function hostOf($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return "";
        }
        $parsed = parse_url($url);

        return isset($parsed["host"]) ? trim(strtolower($parsed["host"])) : "";
    }

    /**
     * Chooses the result bucket for a URL from its extension.
     *
     * The extension is taken from the whole URL, so a bare "http://host.com"
     * has the extension "com"; a URL without any extension is a "domain".
     *
     * @param string $url Absolute URL.
     * @return string "domain", "image", "document" or "unknown".
     */
    private function classify($url)
    {
        global $Url_Extensions, $Document_Extensions, $Image_Extensions;

        $extension = pathinfo($url, PATHINFO_EXTENSION);
        if ($extension == "") {
            return "domain";
        }

        // Cut a query string that trails the extension ("php?id=1" -> "php").
        $queryStart = strpos($extension, "?");
        if ($queryStart !== false) {
            $extension = trim(substr($extension, 0, $queryStart));
        }
        $extension = strtolower($extension);

        if (is_array($Url_Extensions) && in_array($extension, $Url_Extensions, true)) {
            return "domain";
        }
        if (is_array($Image_Extensions) && in_array($extension, $Image_Extensions, true)) {
            return "image";
        }
        if (is_array($Document_Extensions) && in_array($extension, $Document_Extensions, true)) {
            return "document";
        }

        return "unknown";
    }
}
?>
index.php
<?php
// Crawler settings (globals) and the webCrawler class used by the form handler below.
include_once("config.php");
include_once("crawler.php");

// URL submitted with the form, or "" on the first page load. Anything that
// is not a plain string (e.g. url[]=x) is treated as missing.
$submittedUrl = (isset($_POST["url"]) && is_string($_POST["url"])) ? $_POST["url"] : "";
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
    <title>Web Crawler</title>
    <style>
    .textbox
    {
        font-family: Arial,Sans-Serif;
        font-size: 16px;
        height:28px;
        line-height: 24px;
        width: 578px;
        border: 1px solid #ACBABD;
        padding:5px;
    }
    .submitbox
    {
        height:40px;
        width:80px;
        text-align:center;
    }
    </style>
</head>
<body>
    <div style="padding:10px;">
        <div>
            <form name="crawlsearch" method="post" action="index.php">
            <table>
            <tr>
                <td align="center" colspan="2"><b>Web Crawl</b></td>
            </tr>
            <tr>
                <td>
                    <?php // Escaped so the submitted value cannot break out of the attribute. ?>
                    <input class="textbox" type="text" placeholder="Enter URL" name="url" value="<?php echo htmlspecialchars($submittedUrl, ENT_QUOTES, "UTF-8"); ?>">
                </td>
                <td>
                    <input class="submitbox" type="submit" name= "SubmitBox" value="Crawl">
                </td>
            </tr>
            </table>
            </form>
        </div>
    </div>
    <div style="padding:10px;">
    <?php
    if (isset($_POST["SubmitBox"])) {
        $obj = new webCrawler();
        $obj->siteURL = $submittedUrl;
        $returnData = $obj->parser();

        // The result holds URLs taken from a remote page, so it is escaped
        // before being written into this page.
        echo "<pre>";
        echo htmlspecialchars(print_r($returnData, true), ENT_QUOTES, "UTF-8");
        echo "</pre>";
    }
    ?>
    </div>
</body>
</html>