How to create a Web Crawler using PHP

A web crawler is a computer program used to collect key values (HREF links, image links, metadata, etc.) from a given website URL. It is designed to follow the HREF links it has already fetched from the previous URL, so the crawler can jump from one website to another. It is often called a web spider or web bot. This mechanism acts as the backbone of a web search engine.
config.php
1 <?php
2 //---- Href Element Syntax Start ----
3 $hrefTag = array();
4 $hrefTag[0] = "<a";
5 $hrefTag[1] = "href";
6 $hrefTag[2] = "=";
7 $hrefTag[3] = ">";
8 $hrefTagCountStart = 0;
9 $hrefTagCountFinal = count($hrefTag);
10 $hrefTagLengthStart = 0;
11 $hrefTagLengthFinal = strlen($hrefTag[0]);
12 $hrefTagPointer =& $hrefTag[0];
13 //---- Href Element Syntax End ----
14 
15 //---- Image Element Syntax Start ----
16 $imgTag = array();
17 $imgTag[0] = "<img";
18 $imgTag[1] = "src";
19 $imgTag[2] = "=";
20 $imgTag[3] = ">";
21 $imgTagCountStart = 0;
22 $imgTagCountFinal = count($imgTag);
23 $imgTagLengthStart = 0;
24 $imgTagLengthFinal = strlen($imgTag[0]);
25 $imgTagPointer =& $imgTag[0];
26 //---- Image Element Syntax End ----
27 
28 //---- Valid Domain Start -----
29 $Url_Extensions = array("asp","aspx","html","htm","php","php3","biz","com","edu","gov","info","int","jobs",
30 "net","org","in","us","uk");
31 //---- Valid Domain End -------
32 
33 //---- Valid File Extension Start ----
34 $Document_Extensions = array("doc","pdf","ppt","txt");
35 $Image_Extensions = array("gif","jpeg","jpg","png");
36 //---- Valid File Extension End ------
37 
38 //---- Curl Parameters Start ----
39 $crawlOptions = array(
40 CURLOPT_RETURNTRANSFER => true,             // return web page
41 CURLOPT_HEADER         => false,            // don't return headers
42 CURLOPT_FOLLOWLOCATION => true,             // follow redirects
43 CURLOPT_ENCODING       => "",               // handle all encodings
44 CURLOPT_USERAGENT      => "AlgoberryBot",    // who am i
45 CURLOPT_AUTOREFERER    => true,             // set referer on redirect
46 CURLOPT_CONNECTTIMEOUT => 120,              // timeout on connect
47 CURLOPT_TIMEOUT        => 120,              // timeout on response
48 CURLOPT_MAXREDIRS      => 0,                   // stop after 10 redirects
49 );
50 //---- Curl Parameters End ------
51 ?>

crawler.php
1 <?php
/**
 * Single-page link crawler.
 *
 * Downloads the page at $siteURL with cURL, scans the raw markup for the token
 * sequences configured in the global $hrefTag and $imgTag arrays, turns every
 * captured attribute value into an absolute URL and files it under a category
 * chosen from its file extension.
 *
 * Reads these globals (defined in config.php): $hrefTag, $imgTag, $crawlOptions,
 * $Url_Extensions, $Document_Extensions, $Image_Extensions.
 */
class webCrawler
{
    /** URL of the page to crawl; the caller sets it before calling parser(). */
    public $siteURL;

    /** Message of the last failure, or "" when nothing failed. */
    public $error;

    /**
     * Collected links. Keys are created on first use: "domain", "image",
     * "document", "unknown", plus "error" when $error is not empty.
     * Entries are appended and never cleared, so repeated parser() calls on
     * the same instance accumulate.
     */
    public $linkBuffer = array();

    public function __construct()
    {
        $this->siteURL = "";
        $this->error = "";
    }

    /**
     * Fetches $siteURL and collects the links found in it.
     *
     * @return array $linkBuffer (see the property description for its keys).
     */
    public function parser()
    {
        global $hrefTag, $imgTag, $crawlOptions;

        $url = trim($this->siteURL);
        if ($url != "")
        {
            $crawlURL = rtrim($url, "/");

            // Base for plain relative links. For a bare site root dirname()
            // leaves only the scheme ("http:" / "https:"), so use the URL itself.
            $directoryURL = dirname($crawlURL);
            $schemeOnly = strtolower($directoryURL);
            if ($schemeOnly == "http:" || $schemeOnly == "https:")
            {    $directoryURL = $crawlURL;    }

            // e.g. "http://host/a/b" -> array("http:", "", "host", "a", "b")
            $urlParser = explode("/", $crawlURL);

            //-- Curl Start --
            $curlObject = curl_init($crawlURL);
            curl_setopt_array($curlObject, $crawlOptions);
            $webPageContent = curl_exec($curlObject);
            $errorNumber = curl_errno($curlObject);
            curl_close($curlObject);
            //-- Curl End --

            if ($errorNumber == 0)
            {
                // curl_exec() only returns the body as a string when
                // CURLOPT_RETURNTRANSFER is set; anything else has nothing to scan.
                if (is_string($webPageContent))
                {    $this->scanPage($webPageContent, $hrefTag, $imgTag, $urlParser, $directoryURL);    }
            }
            else
            {    $this->error = "Unable to proceed, permission denied";    }
        }
        else
        {    $this->error = "Please enter url";    }

        if ($this->error != "")
        {    $this->linkBuffer["error"] = $this->error;    }

        return $this->linkBuffer;
    }

    /** Walks the page once, feeding every character to the href and the img scanner. */
    private function scanPage($content, array $hrefTokens, array $imgTokens, array $urlParser, $directoryURL)
    {
        // Scanner state is local to this call so nothing leaks between crawls,
        // and each scanner owns its counters so one cannot clobber the other.
        $hrefScanner = $this->newScanner($hrefTokens);
        $imgScanner = $this->newScanner($imgTokens);

        $length = strlen($content);
        $offset = 0;
        while ($offset < $length)
        {
            // Tokens are matched case-insensitively; captured values keep their case.
            $character = strtolower($content[$offset]);
            $this->feed($hrefScanner, $character, $content, $offset, $length, $urlParser, $directoryURL);
            $this->feed($imgScanner, $character, $content, $offset, $length, $urlParser, $directoryURL);
            $offset++;
        }
    }

    /** Builds the initial state of a token-sequence scanner. */
    private function newScanner(array $tokens)
    {
        return array(
            "tokens" => array_values($tokens),   // e.g. "<a", "href", "=", ">"
            "stage" => 0,                        // index of the token being matched
            "matched" => 0,                      // characters of that token matched so far
            "url" => "",                         // last captured attribute value
            "parentDirectoryCount" => 0,         // leading "../" segments of the value
            "singleSlashCount" => 0,             // value started with "/"
            "doubleSlashCount" => 0,             // value started with "//"
        );
    }

    /**
     * Advances one scanner by one character.
     *
     * When the token before the closing one completes, the attribute value is
     * read straight from $content (moving $offset) and $character is replaced
     * by the last character read. When the closing token completes, the
     * captured value is resolved and stored.
     */
    private function feed(array &$scanner, &$character, $content, &$offset, $length, array $urlParser, $directoryURL)
    {
        $tokenCount = count($scanner["tokens"]);
        $token = $scanner["tokens"][$scanner["stage"]];

        if ($token[$scanner["matched"]] !== $character)
        {
            $scanner["matched"] = 0;
            return;
        }

        $scanner["matched"]++;
        if ($scanner["matched"] != strlen($token))
        {    return;    }

        // The whole token matched: move on to the next one.
        $scanner["matched"] = 0;
        $scanner["stage"]++;

        if ($scanner["stage"] == $tokenCount)
        {
            if ($scanner["url"] != "")
            {    $this->storeLink($scanner, $urlParser, $directoryURL);    }
            $scanner["stage"] = 0;
        }

        // Only the closing token is left, so the value follows right here.
        if ($scanner["stage"] == $tokenCount - 1)
        {    $character = $this->readAttributeValue($scanner, $content, $offset, $length, $character);    }
    }

    /**
     * Reads the next quoted value from $content into $scanner["url"].
     *
     * Leading "." and "/" characters are not copied into the value; they are
     * counted so storeLink() can tell "../", "/" and "//" prefixes apart.
     * Reading stops at the matching quote or at "#" (fragments are dropped);
     * $offset is left on the last character of the value.
     *
     * @return string The last character examined.
     */
    private function readAttributeValue(array &$scanner, $content, &$offset, $length, $character)
    {
        $scanner["url"] = "";
        $scanner["parentDirectoryCount"] = 0;
        $scanner["singleSlashCount"] = 0;
        $scanner["doubleSlashCount"] = 0;
        $dotCount = 0;
        $slashCount = 0;

        $offset++;
        while ($offset < $length)
        {
            $character = $content[$offset];
            if ($character === "\"" || $character === "'")
            {
                // Only the same quote character closes the value, so the other
                // kind of quote may appear inside it.
                $quote = $character;
                $offset++;
                while ($offset < $length)
                {
                    $character = $content[$offset];
                    if ($character === $quote || $character === "#")
                    {
                        $offset--;
                        break;
                    }

                    if ($scanner["url"] === "" && ($character === "." || $character === "/"))
                    {
                        if ($character === ".")
                        {
                            $dotCount++;
                            $slashCount = 0;
                        }
                        else
                        {
                            $slashCount++;
                            if ($dotCount == 2 && $slashCount == 1)
                            {    $scanner["parentDirectoryCount"]++;    }
                            else if ($dotCount == 0 && $slashCount == 1)
                            {    $scanner["singleSlashCount"]++;    }
                            else if ($dotCount == 0 && $slashCount == 2)
                            {    $scanner["doubleSlashCount"]++;    }
                            $dotCount = 0;
                        }
                    }
                    else
                    {    $scanner["url"] .= $character;    }

                    $offset++;
                }
                break;
            }
            $offset++;
        }

        return $character;
    }

    /**
     * Resolves the captured value to an absolute URL, classifies it by
     * extension and appends it to $linkBuffer unless it is already listed.
     * Values that do not end up as a valid URL with a host are dropped.
     */
    private function storeLink(array $scanner, array $urlParser, $directoryURL)
    {
        global $Url_Extensions, $Document_Extensions, $Image_Extensions;

        $link = $scanner["url"];

        if ($scanner["doubleSlashCount"] >= 1)
        {
            // Protocol-relative link: reuse the scheme of the crawled page.
            $scheme = (isset($urlParser[0]) && strtolower($urlParser[0]) == "https:") ? "https:" : "http:";
            $link = $scheme."//".$link;
        }
        else if ($scanner["parentDirectoryCount"] >= 1)
        {
            // Drop one path segment per "../", but never go above scheme + host
            // (the first three pieces of the split URL).
            $keep = max(3, count($urlParser) - $scanner["parentDirectoryCount"]);
            $link = implode("/", array_slice($urlParser, 0, $keep))."/".$link;
        }
        else if ($scanner["singleSlashCount"] >= 1)
        {
            // Root-relative link: scheme + host of the crawled page.
            $link = implode("/", array_slice($urlParser, 0, 3))."/".$link;
        }

        $link = rtrim(urldecode($link), "/");

        // Anything that is not a valid URL yet is treated as relative to the
        // directory of the crawled page.
        if (filter_var($link, FILTER_VALIDATE_URL) === false)
        {    $link = $directoryURL."/".$link;    }

        $host = "";
        if (filter_var($link, FILTER_VALIDATE_URL) !== false)
        {
            $parts = parse_url($link);
            if (isset($parts["host"]))
            {    $host = trim(strtolower($parts["host"]));    }
        }
        if ($host == "")
        {    return;    }

        $type = "domain";
        $extension = pathinfo($link, PATHINFO_EXTENSION);
        if ($extension != "")
        {
            // pathinfo() keeps a trailing query string ("php?id=1"); cut it off.
            $queryStart = strpos($extension, "?");
            if ($queryStart !== false)
            {    $extension = trim(substr($extension, 0, $queryStart));    }
            $extension = strtolower($extension);

            if (in_array($extension, $Url_Extensions, true))
            {    $type = "domain";    }
            else if (in_array($extension, $Image_Extensions, true))
            {    $type = "image";    }
            else if (in_array($extension, $Document_Extensions, true))
            {    $type = "document";    }
            else
            {    $type = "unknown";    }
        }

        if (!isset($this->linkBuffer[$type]) || !in_array($link, $this->linkBuffer[$type], true))
        {    $this->linkBuffer[$type][] = $link;    }
    }
}
402 ?>

index.php
1 <?php
// Front page of the crawler: shows the URL form and, after a submit, crawls
// the given URL and prints the collected links.
include_once("config.php");
include_once("crawler.php");

// The URL field comes straight from the request; accept it only as a string.
$submittedUrl = (isset($_POST["url"]) && is_string($_POST["url"])) ? $_POST["url"] : "";
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
    <title>Web Crawler</title>
    <style>
    .textbox
    { 
        border: medium none;
        font-family: Arial,Sans-Serif;
        font-size: 16px;
        height:28px;
        line-height: 24px;
        width: 578px;
        border: 1px solid #ACBABD;
        padding:5px;
    }
    .submitbox 
    {    
        height:40px; 
        width:80px; 
        text-align:center;
    } 
    </style>
</head>
<body>
    <div style="padding:10px;">
        <div>
            <form name="crawlsearch" method="post" action="index.php">
            <table>
            <tr>
                <td align="center" colspan="2"><b>Web Crawl</b></td>
            </tr>
            <tr>
                <td>
                    <?php /* Escaped: the value is user input echoed into an attribute. */ ?>
                    <input class="textbox" type="text" placeholder="Enter URL" name="url" value="<?php echo htmlspecialchars($submittedUrl, ENT_QUOTES, "UTF-8"); ?>">
                </td>
                <td>
                    <input class="submitbox" type="submit" name= "SubmitBox" value="Crawl">
                </td>
            </tr>
            </table>
            </form>
        </div>
    </div>
    <div style="padding:10px;">
    <?php
    if(isset($_POST["SubmitBox"]))
    {
        $obj = new webCrawler();
        $obj->siteURL = $submittedUrl;
        $returnData = $obj->parser();
        echo "<pre>";
        // The links were scraped from a remote page, so escape them before output.
        echo htmlspecialchars(print_r($returnData, true), ENT_QUOTES, "UTF-8");
        echo "</pre>";
    }
    ?>
    </div>
</body>
</html>

No comments:

Post a Comment