How to create a Web Crawler using PHP

A web crawler is a computer program that collects (crawls) key values such as HREF links, image links and metadata from a given website URL. It is designed to follow the HREF links it has already fetched from a previous URL, so the crawler can jump from one website to another. It is also commonly called a web spider or web bot. This mechanism acts as the backbone of a web search engine.
config.php
1 <?php
// Crawler configuration. Every variable below is read as a global by
// webCrawler::parser() in crawler.php, so the names must not change.

//---- Href Element Syntax Start ----
// Tokens the crawler matches, in this order, to pick up <a ... href="..." ...>.
$hrefTag = array();
$hrefTag[0] = "<a";
$hrefTag[1] = "href";
$hrefTag[2] = "=";
$hrefTag[3] = ">";
// Matcher state: index of the token being matched and total number of tokens.
$hrefTagCountStart = 0;
$hrefTagCountFinal = count($hrefTag);
// Matcher state: characters matched so far in the current token and its length.
$hrefTagLengthStart = 0;
$hrefTagLengthFinal = strlen($hrefTag[0]);
// Reference to the token currently being matched.
$hrefTagPointer =& $hrefTag[0];
//---- Href Element Syntax End ----

//---- Image Element Syntax Start ----
// Same layout as above, for <img ... src="..." ...>.
$imgTag = array();
$imgTag[0] = "<img";
$imgTag[1] = "src";
$imgTag[2] = "=";
$imgTag[3] = ">";
$imgTagCountStart = 0;
$imgTagCountFinal = count($imgTag);
$imgTagLengthStart = 0;
$imgTagLengthFinal = strlen($imgTag[0]);
$imgTagPointer =& $imgTag[0];
//---- Image Element Syntax End ----

//---- Valid Domain Start -----
// "Extensions" that classify a link as a crawlable page ("domain"). The list
// mixes page file extensions with top-level domains because the crawler takes
// the extension with pathinfo(), which yields "com" for "http://example.com".
$Url_Extensions = array("asp","aspx","html","htm","php","php3","biz","com","edu","gov","info","int","jobs",
"net","org","in","us","uk");
//---- Valid Domain End -------

//---- Valid File Extension Start ----
$Document_Extensions = array("doc","pdf","ppt","txt");
$Image_Extensions = array("gif","jpeg","jpg","png");
//---- Valid File Extension End ------

//---- Curl Parameters Start ----
$crawlOptions = array(
CURLOPT_RETURNTRANSFER  => true,     		// return web page
CURLOPT_HEADER          => false,    		// don't return headers
CURLOPT_FOLLOWLOCATION  => true,     		// follow redirects
CURLOPT_ENCODING        => "",       		// handle all encodings
CURLOPT_USERAGENT       => "AlgoberryBot",	// who am i
CURLOPT_AUTOREFERER     => true,     		// set referer on redirect
CURLOPT_CONNECTTIMEOUT  => 120,      		// timeout on connect
CURLOPT_TIMEOUT         => 120,      		// timeout on response
// Was 0, which makes cURL refuse every redirect even though FOLLOWLOCATION
// is enabled, so any redirecting URL (e.g. http -> https) failed to crawl.
CURLOPT_MAXREDIRS       => 10,       		// stop after 10 redirects
// The URL comes from a web form: only allow web protocols so that file://,
// gopher:// and similar URLs cannot be requested through the crawler.
CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
);
//---- Curl Parameters End ------
51 ?>
crawler.php
1 <?php
/**
 * Single-page link extractor.
 *
 * Set $siteURL, call parser(), and read the returned array: links found in
 * <a href> and <img src> attributes, grouped under the keys "domain",
 * "image", "document" and "unknown". A key is present only when at least one
 * link of that type was found. On failure the array holds an "error" key.
 *
 * Configuration is read from the globals defined in config.php:
 * $hrefTag, $imgTag, $Url_Extensions, $Document_Extensions,
 * $Image_Extensions and $crawlOptions.
 */
class webCrawler
{
	/** URL of the page to crawl; the caller sets this before calling parser(). */
	public $siteURL;
	/** Error message of the most recent parser() call, or "" when it succeeded. */
	public $error;
	/** Result of the most recent parser() call (same array that parser() returns). */
	public $linkBuffer;

	public function __construct()
	{
		$this->siteURL = "";
		$this->error = "";
		$this->linkBuffer = array();
	}

	/**
	 * Downloads $this->siteURL and collects the links it contains.
	 *
	 * @return array Links grouped by type, plus an "error" entry on failure.
	 */
	public function parser()
	{
		global $hrefTag, $imgTag, $crawlOptions;

		// Start from a clean slate so a second call on the same object does
		// not report links or errors left over from the previous one.
		$this->linkBuffer = array();
		$this->error = "";

		// A form can submit url[]=..., which arrives as an array; treat
		// anything that is not a string as "no URL given".
		$url = is_string($this->siteURL) ? trim($this->siteURL) : "";
		if($url == "")
		{	$this->error = "Please enter url";	}
		else
		{
			$crawlURL = rtrim($url, "/");

			// Base used for relative links. For a bare host, dirname() returns
			// only the scheme ("http:" / "https:"), so use the URL itself.
			$directoryURL = dirname($crawlURL);
			if($directoryURL == "http:" || $directoryURL == "https:")
			{	$directoryURL = $crawlURL;	}

			// "http://host/dir" => array("http:", "", "host", "dir")
			$urlParser = explode("/", $crawlURL);

			$webPageContent = $this->fetchPage($crawlURL, $crawlOptions);
			if($webPageContent === false)
			{	$this->error = "Unable to proceed, permission denied";	}
			else
			{
				$this->scanPage(
					$webPageContent,
					$urlParser,
					$directoryURL,
					$this->tagTokens($hrefTag, array("<a", "href", "=", ">")),
					$this->tagTokens($imgTag, array("<img", "src", "=", ">"))
				);
			}
		}

		if($this->error != "")
		{	$this->linkBuffer["error"] = $this->error;	}

		return $this->linkBuffer;
	}

	/**
	 * Fetches a page body with cURL.
	 *
	 * @param string $crawlURL URL to download.
	 * @param mixed  $options  cURL option array from config.php (anything else is ignored).
	 * @return string|false Page body, or false when the transfer failed.
	 */
	private function fetchPage($crawlURL, $options)
	{
		if(!is_array($options))
		{	$options = array();	}
		// The scanner needs the body as a string; without this option
		// curl_exec() prints the page and returns true.
		$options[CURLOPT_RETURNTRANSFER] = true;

		$curlObject = curl_init($crawlURL);
		if($curlObject === false)
		{	return false;	}
		curl_setopt_array($curlObject, $options);
		$webPageContent = curl_exec($curlObject);
		$errorNumber = curl_errno($curlObject);
		curl_close($curlObject);

		if($errorNumber != 0 || !is_string($webPageContent))
		{	return false;	}

		return $webPageContent;
	}

	/**
	 * Returns the configured tag tokens if they are usable, else the fallback.
	 *
	 * @param mixed $configured Token list from config.php.
	 * @param array $fallback   Tokens to use when $configured is not a list of
	 *                          at least two non-empty strings.
	 * @return array Zero-indexed list of tokens.
	 */
	private function tagTokens($configured, array $fallback)
	{
		if(!is_array($configured) || count($configured) < 2)
		{	return $fallback;	}

		$tokens = array();
		foreach($configured as $token)
		{
			if(!is_string($token) || $token === "")
			{	return $fallback;	}
			$tokens[] = $token;
		}
		return $tokens;
	}

	/**
	 * Builds the initial state of a tag scanner.
	 *
	 * Each tag kind gets its own state, so reading an <img src> value cannot
	 * overwrite the counters of an <a href> that is still waiting for ">".
	 */
	private function newScanner(array $tokens)
	{
		return array(
			"tokens" => $tokens,			// tokens to match, in order
			"stage" => 0,					// index of the token being matched
			"matched" => 0,					// characters of that token matched so far
			"url" => "",					// attribute value read for the current tag
			"parentDirectoryCount" => 0,	// leading "../" seen in the value
			"singleSlashCount" => 0,		// value started with "/"
			"doubleSlashCount" => 0,		// value started with "//"
		);
	}

	/**
	 * Walks the page once, feeding every character to the href scanner and
	 * then to the img scanner, so links are recorded in document order.
	 */
	private function scanPage($webPageContent, array $urlParser, $directoryURL, array $hrefTokens, array $imgTokens)
	{
		$hrefScanner = $this->newScanner($hrefTokens);
		$imgScanner = $this->newScanner($imgTokens);

		$webPageLength = strlen($webPageContent);
		$webPageCounter = 0;
		while($webPageCounter < $webPageLength)
		{
			// Tag and attribute names are matched case-insensitively.
			$character = strtolower($webPageContent[$webPageCounter]);

			// A scanner that reads an attribute value moves $webPageCounter
			// forward and leaves the last character it read in $character.
			$this->feedScanner($hrefScanner, $character, $webPageContent, $webPageCounter, $urlParser, $directoryURL);
			$this->feedScanner($imgScanner, $character, $webPageContent, $webPageCounter, $urlParser, $directoryURL);

			$webPageCounter++;
		}
	}

	/**
	 * Advances one scanner by one character.
	 *
	 * When the token before the last one ("=") completes, the attribute value
	 * is read immediately. When the last token (">") completes, the value read
	 * for this tag is resolved and recorded, and the scanner starts over.
	 */
	private function feedScanner(array &$scanner, &$character, $webPageContent, &$webPageCounter, array $urlParser, $directoryURL)
	{
		$tokens = $scanner["tokens"];
		$token = $tokens[$scanner["stage"]];

		if($token[$scanner["matched"]] !== $character)
		{
			// Mismatch: restart the current token, but stay in the same stage.
			$scanner["matched"] = 0;
			return;
		}

		$scanner["matched"]++;
		if($scanner["matched"] < strlen($token))
		{	return;	}

		// The whole token matched: move on to the next stage.
		$scanner["matched"] = 0;
		$scanner["stage"]++;
		$lastStage = count($tokens) - 1;

		if($scanner["stage"] > $lastStage)
		{
			if($scanner["url"] != "")
			{	$this->recordLink($scanner, $urlParser, $directoryURL);	}
			$scanner["stage"] = 0;
		}
		else if($scanner["stage"] == $lastStage)
		{	$this->readAttributeValue($scanner, $character, $webPageContent, $webPageCounter);	}
	}

	/**
	 * Reads a quoted attribute value starting after the current character.
	 *
	 * Leading "." and "/" characters are counted instead of stored, so the
	 * value can later be resolved against the crawled URL. Reading stops at
	 * the next quote character or at "#" (the fragment is dropped).
	 */
	private function readAttributeValue(array &$scanner, &$character, $webPageContent, &$webPageCounter)
	{
		$scanner["url"] = "";
		$scanner["parentDirectoryCount"] = 0;
		$scanner["singleSlashCount"] = 0;
		$scanner["doubleSlashCount"] = 0;
		$dotCount = 0;
		$slashCount = 0;
		$webPageLength = strlen($webPageContent);

		// Skip forward to the opening quote.
		$webPageCounter++;
		while($webPageCounter < $webPageLength)
		{
			$character = $webPageContent[$webPageCounter];
			if($character == "\"" || $character == "'")
			{	break;	}
			$webPageCounter++;
		}

		$webPageCounter++;
		while($webPageCounter < $webPageLength)
		{
			$character = $webPageContent[$webPageCounter];
			if($character == "\"" || $character == "'" || $character == "#")
			{
				// Step back one so the caller's increment lands on the
				// terminator and the scanners process it on the next pass.
				$webPageCounter--;
				return;
			}

			if($scanner["url"] != "" || ($character != "." && $character != "/"))
			{	$scanner["url"] .= $character;	}
			else if($character == ".")
			{
				$dotCount++;
				$slashCount = 0;
			}
			else
			{
				$slashCount++;
				if($dotCount == 2 && $slashCount == 1)
				{	$scanner["parentDirectoryCount"]++;	}
				else if($dotCount == 0 && $slashCount == 1)
				{	$scanner["singleSlashCount"]++;	}
				else if($dotCount == 0 && $slashCount == 2)
				{	$scanner["doubleSlashCount"]++;	}
				$dotCount = 0;
			}
			$webPageCounter++;
		}
	}

	/**
	 * Resolves the value a scanner read into an absolute URL and stores it
	 * under its type, unless it cannot be resolved or is already stored.
	 */
	private function recordLink(array $scanner, array $urlParser, $directoryURL)
	{
		$link = $scanner["url"];

		if($scanner["doubleSlashCount"] >= 1)
		{
			// Protocol-relative link: "//host/path".
			$link = "http://".$link;
		}
		else if($scanner["parentDirectoryCount"] >= 1)
		{
			// Drop one path segment of the crawled URL per "../", but never
			// climb above "scheme://host" (the first three segments).
			$segmentCount = count($urlParser);
			$keep = max($segmentCount - $scanner["parentDirectoryCount"], min(3, $segmentCount));
			$link = implode("/", array_slice($urlParser, 0, $keep))."/".$link;
		}
		else if($scanner["singleSlashCount"] >= 1)
		{
			// Root-relative link: prefix with "scheme://host/".
			$root = array_pad(array_slice($urlParser, 0, 3), 3, "");
			$link = implode("/", $root)."/".$link;
		}

		$link = rtrim(urldecode($link), "/");

		$host = "";
		if(filter_var($link, FILTER_VALIDATE_URL) !== false)
		{	$host = $this->hostOf($link);	}
		else
		{
			// Not absolute yet: treat it as relative to the crawled directory.
			$link = $directoryURL."/".$link;
			if(filter_var($link, FILTER_VALIDATE_URL) !== false)
			{	$host = $this->hostOf($link);	}
		}
		if($host == "")
		{	return;	}

		$type = $this->classifyLink($link);
		if(!isset($this->linkBuffer[$type]))
		{	$this->linkBuffer[$type] = array();	}
		if(!in_array($link, $this->linkBuffer[$type], true))
		{	$this->linkBuffer[$type][] = $link;	}
	}

	/** Returns the lower-cased host of a URL, or "" when it has none. */
	private function hostOf($link)
	{
		$dump = parse_url($link);
		if(!is_array($dump) || !isset($dump["host"]))
		{	return "";	}
		return trim(strtolower($dump["host"]));
	}

	/**
	 * Classifies a URL by its extension as "domain", "image", "document" or
	 * "unknown", using the extension lists from config.php. A URL without an
	 * extension counts as "domain".
	 */
	private function classifyLink($link)
	{
		global $Url_Extensions, $Document_Extensions, $Image_Extensions;

		$extension = pathinfo($link, PATHINFO_EXTENSION);
		if($extension == "")
		{	return "domain";	}

		// "php?id=1" => "php"
		$queryStart = strpos($extension, "?");
		if($queryStart !== false)
		{	$extension = trim(substr($extension, 0, $queryStart));	}

		if(is_array($Url_Extensions) && in_array($extension, $Url_Extensions, true))
		{	return "domain";	}
		if(is_array($Image_Extensions) && in_array($extension, $Image_Extensions, true))
		{	return "image";	}
		if(is_array($Document_Extensions) && in_array($extension, $Document_Extensions, true))
		{	return "document";	}
		return "unknown";
	}
}
402 ?>
index.php
<?php
// Front page of the crawler: shows a URL form and, after submission, prints
// the links that webCrawler::parser() found on that page.
include_once("config.php");
include_once("crawler.php");

// The submitted URL; anything that is not a plain string (missing field,
// url[]=... array) is treated as empty.
$submittedURL = (isset($_POST["url"]) && is_string($_POST["url"])) ? $_POST["url"] : "";
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
	<title>Web Crawler</title>
	<style>
	.textbox
	{ 
		border: medium none;
		font-family: Arial,Sans-Serif;
		font-size: 16px;
		height:28px;
		line-height: 24px;
		width: 578px;
		border: 1px solid #ACBABD;
		padding:5px;
	}
	.submitbox 
	{	
		height:40px; 
		width:80px; 
		text-align:center;
	} 
	</style>
</head>
<body>
	<div style="padding:10px;">
		<div>
			<form name="crawlsearch" method="post" action="index.php">
			<table>
			<tr>
				<td align="center" colspan="2"><b>Web Crawl</b></td>
			</tr>
			<tr>
				<td>
					<input class="textbox" type="text" placeholder="Enter URL" name="url" value="<?php echo htmlspecialchars($submittedURL, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8"); ?>">
				</td>
				<td>
					<input class="submitbox" type="submit" name= "SubmitBox" value="Crawl">
				</td>
			</tr>
			</table>
			</form>
		</div>
	</div>
	<div style="padding:10px;">
	<?php
	if(isset($_POST["SubmitBox"]))
	{
		$obj = new webCrawler();
		$obj->siteURL = $submittedURL;
		$returnData = $obj->parser();
		// The result contains URLs taken from a third-party page, so it is
		// escaped before being written into this page.
		echo "<pre>";
		echo htmlspecialchars(print_r($returnData, true), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
		echo "</pre>";
	}
	?>
	</div>
</body>
</html>