Building a PHP Web Crawler from Scratch

A web crawler, also known as a web spider, is an automated program or script that systematically navigates through websites on the Internet to gather information. Web crawlers start by visiting a seed URL and then follow hyperlinks to other pages, recursively exploring and indexing the content they find. The primary purpose of a web crawler is to gather data from web pages, such as text, images, links, metadata, and other relevant information. This collected data is typically used for various purposes, including web indexing, data mining, content scraping, and search engine optimization. Web crawlers work by sending HTTP requests to web servers, downloading web pages, parsing the HTML or other structured data, and extracting relevant information. They follow the links found on each page to discover new URLs to crawl, creating a vast network of interconnected web pages.

You can get the complete code from GitHub.

index.php
1 <?php
// Entry script: crawl one seed URL and dump everything the crawler extracted.
//
// config.php must be loaded before the crawler runs because WebCrawler reads
// its tag markers and cURL options from the global variables defined there.
// require_once (rather than include_once) stops immediately with a clear
// error if either file is missing.
require_once("config.php");
require_once("WebCrawler.php");

$obj = new WebCrawler();
$data = $obj->parser("https://www.algoberry.com");

// Titles, meta values and URLs come straight from the downloaded page, so the
// dump is escaped before it is written into this HTML response.
// ENT_SUBSTITUTE keeps the output non-empty when the page is not valid UTF-8.
echo "<pre>";
echo htmlspecialchars(print_r($data,true),ENT_QUOTES | ENT_SUBSTITUTE,"UTF-8");
echo "</pre>";
10 ?>
config.php
1 <?php
// Scanner configuration shared with WebCrawler through global variables.
// Each marker group holds the literal text to look for, plus (where present)
// its length and the offset at which matching starts.

//-- Opening and closing markers of the document head
$outerHead = "<head>";
$outerHeadLength = strlen($outerHead);
$outerHeadStart = 0;

$innerHead = "</head>";
$innerHeadLength = strlen($innerHead);
$innerHeadStart = 0;
//--

//-- Opening and closing markers of the page title
$outerTitle = "<title>";
$outerTitleLength = strlen($outerTitle);
$outerTitleStart = 0;

$innerTitle = "</title>";
$innerTitleLength = strlen($innerTitle);
$innerTitleStart = 0;
//--

//-- Start and end of a <meta ...> tag
$outerMeta = "<meta";
$innerMeta = ">";
$metaPointer = 0;
//--

//-- Attribute that names a meta entry: name="..."
$metaNameBase = "name=";
$metaNamePointer = 0;
//--

//-- Alternative naming attribute: property="..."
$metaPropertyBase = "property=";
$metaPropertyPointer = 0;
//--

//-- Attribute carrying the meta value: content="..."
$metaContentBase = "content=";
$metaContentPointer = 0;
//--

//-- Token sequence of a hyperlink, matched in order: <a ... href = "..." ... >
$hrefTag = array();
$hrefTag[0] = "<a";
$hrefTag[1] = "href";
$hrefTag[2] = "=";
$hrefTag[3] = ">";
$hrefTagCountStart = 0;                      // index of the token currently being matched
$hrefTagCountFinal = count($hrefTag);        // number of tokens in the sequence
$hrefTagLengthStart = 0;                     // offset inside the current token
$hrefTagLengthFinal = strlen($hrefTag[0]);   // length of the current token
$hrefTagPointer =& $hrefTag[0];              // reference to the current token
//--

//-- Token sequence of an image, matched in order: <img ... src = "..." ... >
$imgTag = array();
$imgTag[0] = "<img";
$imgTag[1] = "src";
$imgTag[2] = "=";
$imgTag[3] = ">";
$imgTagCountStart = 0;                       // index of the token currently being matched
$imgTagCountFinal = count($imgTag);          // number of tokens in the sequence
$imgTagLengthStart = 0;                      // offset inside the current token
$imgTagLengthFinal = strlen($imgTag[0]);     // length of the current token
$imgTagPointer =& $imgTag[0];                // reference to the current token
//--

//-- cURL options applied to every page download
$crawlOptions = array(
    CURLOPT_RETURNTRANSFER => true,           // return web page
    CURLOPT_HEADER         => false,          // don't return headers
    CURLOPT_FOLLOWLOCATION => true,           // follow redirects
    CURLOPT_ENCODING       => "",             // handle all encodings
    CURLOPT_USERAGENT      => "algoberrybot", // who am i
    CURLOPT_AUTOREFERER    => true,           // set referer on redirect
    CURLOPT_CONNECTTIMEOUT => 10,             // timeout on connect
    CURLOPT_TIMEOUT        => 10,             // timeout on response
    // A limit of 0 makes libcurl refuse every redirect, which turns any
    // redirected page into a transfer error despite FOLLOWLOCATION above.
    CURLOPT_MAXREDIRS      => 10              // stop after 10 redirects
);
//--
82 ?>
WebCrawler.php
1 <?php
/**
 * Minimal hand-written HTML scanner.
 *
 * Downloads one page and extracts its title, its named <meta> entries, its
 * hyperlinks (split into same-host and foreign-host) and its image URLs.
 * The text markers it searches for ("<head>", "<title>", the <a>/<img> token
 * sequences, ...) and the cURL options are read from the global variables
 * defined in config.php, so that file must be loaded first.
 *
 * All match cursors are local to a call, so one instance can parse any number
 * of pages without state leaking from one page into the next.
 */
class WebCrawler {

    /** Raw content of the page currently being scanned. */
    public $webPageContent;

    /**
     * Download $directoryURL and extract its data.
     *
     * @param string $directoryURL Absolute URL of the page. It is also used as
     *                             the base (treated as a directory) when
     *                             relative links are resolved.
     * @return array Possibly empty array with the optional keys "title"
     *               (string), "meta" (name/property => content), "href"
     *               ("own" and "other" URL lists) and "img" (URL list). An
     *               empty array is returned for an empty or invalid URL and
     *               when the download fails.
     */
    public function parser($directoryURL) {
        global $crawlOptions;

        $result = array();
        if($directoryURL == "" || filter_var($directoryURL,FILTER_VALIDATE_URL) === false) {
            return $result;
        }

        $curlObject = curl_init($directoryURL);
        if($curlObject === false) {
            return $result;
        }
        curl_setopt_array($curlObject,$crawlOptions);
        $this->webPageContent = curl_exec($curlObject);
        $errorNumber = curl_errno($curlObject);
        curl_close($curlObject);

        // curl_exec() yields a string only when the transfer succeeded and
        // CURLOPT_RETURNTRANSFER is enabled; anything else cannot be scanned.
        if($errorNumber != 0 || !is_string($this->webPageContent)) {
            return $result;
        }
        return $this->parseContent($this->webPageContent,$directoryURL);
    }

    /**
     * Extract title, meta entries, links and images from already fetched HTML.
     *
     * The head section is scanned for the title and meta entries; links and
     * images are collected from everything after the closing head marker. If
     * the opening head marker never occurs, the whole content is scanned for
     * links and images instead of returning nothing.
     *
     * @param string $content      HTML source to scan.
     * @param string $directoryURL Base URL used to resolve relative references
     *                             and to decide which links are "own".
     * @return array Same structure as parser().
     */
    public function parseContent($content, $directoryURL) {
        global $outerHead, $hrefTag, $imgTag;

        $result = array();
        $this->webPageContent = (string) $content;
        $contentLength = strlen($this->webPageContent);

        $baseParts = parse_url($directoryURL);
        $ownHost = (is_array($baseParts) && isset($baseParts["host"])) ? $baseParts["host"] : "";

        //-- Head section: title and meta entries
        $counter = 0;
        $headCursor = 0;
        $headFound = false;
        while($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            // Blanks are ignored while matching, so "<head >" is accepted too.
            if($character != " " && $this->matchesMarker($outerHead,$headCursor,$character)) {
                $headFound = true;
                $counter++;
                $this->scanHead($counter,$result);
                break;
            }
            $counter++;
        }
        //--

        //-- Links and images
        // After scanHead() the counter rests on the last character of the
        // closing head marker, so the body scan starts one position later.
        $counter = $headFound ? $counter + 1 : 0;
        $anchorScanner = array("tokens" => $hrefTag, "index" => 0, "offset" => 0, "url" => "");
        $imageScanner = array("tokens" => $imgTag, "index" => 0, "offset" => 0, "url" => "");
        while($counter < $contentLength) {
            $anchorURL = $this->advanceTagScanner($anchorScanner,$counter,$directoryURL);
            if($anchorURL !== null && filter_var($anchorURL,FILTER_VALIDATE_URL) !== false) {
                // URLs such as mailto: validate but carry no host component.
                $linkParts = parse_url($anchorURL);
                $linkHost = (is_array($linkParts) && isset($linkParts["host"])) ? $linkParts["host"] : "";
                if($linkHost != "" && strcasecmp($ownHost,$linkHost) == 0) {
                    $result["href"]["own"][] = $anchorURL;
                }
                else {
                    $result["href"]["other"][] = $anchorURL;
                }
            }
            // The anchor scanner may have consumed an attribute value and
            // moved the counter; the image scanner sees the new position.
            if($counter >= $contentLength) {
                break;
            }
            $imageURL = $this->advanceTagScanner($imageScanner,$counter,$directoryURL);
            if($imageURL !== null && filter_var($imageURL,FILTER_VALIDATE_URL) !== false) {
                $result["img"][] = $imageURL;
            }
            $counter++;
        }
        //--

        return $result;
    }

    /**
     * Return the part of the current page between two offsets, both inclusive.
     *
     * @param int $start Offset of the first character.
     * @param int $end   Offset of the last character.
     * @return string The selected text, or "" when $start lies behind $end.
     */
    public function collectData($start, $end) {
        if($start > $end) {
            return "";
        }
        return (string) substr((string) $this->webPageContent,$start,$end - $start + 1);
    }

    /**
     * Feed one character to an incremental marker match.
     *
     * @param string $marker    Text being searched for.
     * @param int    $cursor    Number of marker characters matched so far;
     *                          updated in place and reset on a mismatch or
     *                          after a complete match.
     * @param string $character Character read from the page.
     * @return bool True when this character completed the marker.
     */
    private function matchesMarker($marker, &$cursor, $character) {
        if($marker[$cursor] !== $character) {
            $cursor = 0;
            return false;
        }
        $cursor++;
        if($cursor < strlen($marker)) {
            return false;
        }
        $cursor = 0;
        return true;
    }

    /** Character at $position of the current page, or "" past its end. */
    private function characterAt($position) {
        return ($position < strlen($this->webPageContent)) ? $this->webPageContent[$position] : "";
    }

    /**
     * Scan the head section for the title and meta tags.
     *
     * Starts at $counter and stops on the last character of the closing head
     * marker, or past the end of the content when that marker is missing.
     */
    private function scanHead(&$counter, &$result) {
        global $innerHead, $outerTitle, $outerMeta;

        $contentLength = strlen($this->webPageContent);
        $titleCursor = 0;
        $metaCursor = 0;
        $closeCursor = 0;
        while($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if($character == " ") {
                $counter++;
                continue;
            }
            // The three markers are tracked in parallel on every character.
            // When one of the first two completes, its sub-scan moves the
            // counter, so the character is re-read for the remaining checks.
            if($this->matchesMarker($outerTitle,$titleCursor,$character)) {
                $counter++;
                $this->scanTitle($counter,$result);
                $character = $this->characterAt($counter);
            }
            if($this->matchesMarker($outerMeta,$metaCursor,$character)) {
                $counter++;
                $this->scanMetaTag($counter,$result);
                $character = $this->characterAt($counter);
            }
            if($this->matchesMarker($innerHead,$closeCursor,$character)) {
                return;
            }
            $counter++;
        }
    }

    /**
     * Read the title text that starts at $counter and store it in
     * $result["title"] once the closing title marker has been matched.
     * Nothing is stored when the closing marker is missing.
     */
    private function scanTitle(&$counter, &$result) {
        global $innerTitle;

        $contentLength = strlen($this->webPageContent);
        $startPosition = $counter;
        $endPosition = $counter - 1;
        $closeCursor = 0;
        while($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if($character == " ") {
                $counter++;
                continue;
            }
            // Every possible start of the closing marker is remembered as the
            // provisional end of the title text.
            if($closeCursor == 0 && $innerTitle[0] === $character) {
                $endPosition = $counter - 1;
            }
            if($this->matchesMarker($innerTitle,$closeCursor,$character)) {
                $result["title"] = $this->collectData($startPosition,$endPosition);
                return;
            }
            $counter++;
        }
    }

    /**
     * Scan the attributes of one meta tag, starting at $counter.
     *
     * The value of name="..." or property="..." becomes the key and the value
     * of content="..." the value of an entry in $result["meta"]; the entry is
     * stored only when both are non-empty. Stops on the tag's end marker.
     */
    private function scanMetaTag(&$counter, &$result) {
        global $innerMeta, $metaNameBase, $metaPropertyBase, $metaContentBase;

        $contentLength = strlen($this->webPageContent);
        $metaType = "";
        $metaValue = "";
        $nameCursor = 0;
        $propertyCursor = 0;
        $contentCursor = 0;
        while($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if($character == " ") {
                $counter++;
                continue;
            }
            if($this->matchesMarker($metaNameBase,$nameCursor,$character)) {
                $counter++;
                $metaType = (string) $this->readQuotedValue($counter);
                $character = $this->characterAt($counter);
            }
            if($this->matchesMarker($metaPropertyBase,$propertyCursor,$character)) {
                $counter++;
                $metaType = (string) $this->readQuotedValue($counter);
                $character = $this->characterAt($counter);
            }
            if($this->matchesMarker($metaContentBase,$contentCursor,$character)) {
                $counter++;
                $metaValue = (string) $this->readQuotedValue($counter);
                $character = $this->characterAt($counter);
            }
            if($innerMeta == $character) {
                if($metaType != "" && $metaValue != "") {
                    $result["meta"][$metaType] = $metaValue;
                }
                return;
            }
            $counter++;
        }
    }

    /**
     * Read the next quoted value, searching forward from $counter.
     *
     * The value ends at the next occurrence of the SAME quote character that
     * opened it, so content="Bob's blog" is read in full. On return $counter
     * rests on the closing quote, on the abort character, or at the end of
     * the content.
     *
     * @param int    $counter        Scan position, updated in place.
     * @param string $abortCharacter Optional character that cancels the read.
     * @return string|null The text between the quotes, "" when no complete
     *                     quoted value follows, or null when the abort
     *                     character was met first.
     */
    private function readQuotedValue(&$counter, $abortCharacter = "") {
        $contentLength = strlen($this->webPageContent);
        $quote = "";
        $leftPosition = 0;
        while($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if($character == "\"" || $character == "'") {
                if($quote == "") {
                    $quote = $character;
                    $leftPosition = $counter + 1;
                }
                else if($quote == $character) {
                    return $this->collectData($leftPosition,$counter - 1);
                }
            }
            else if($abortCharacter != "" && $character == $abortCharacter) {
                return null;
            }
            $counter++;
        }
        return "";
    }

    /**
     * Advance one tag scanner (anchor or image) by the character at $counter.
     *
     * A scanner matches its tokens in order, e.g. "<a", "href", "=", ">".
     * Once the token before the last one is complete, the quoted attribute
     * value that follows is read and resolved; once the last token is
     * complete, that URL is handed back. A value containing "#" is discarded
     * and the scanner starts over.
     *
     * @param array  $scanner      Scanner state ("tokens", "index", "offset",
     *                             "url"), updated in place.
     * @param int    $counter      Scan position; moved forward when an
     *                             attribute value is consumed.
     * @param string $directoryURL Base URL for relative references.
     * @return string|null URL of the tag that was just completed (possibly
     *                     "" or not a valid URL), or null while no tag is
     *                     complete.
     */
    private function advanceTagScanner(&$scanner, &$counter, $directoryURL) {
        $tokens = $scanner["tokens"];
        $tokenCount = count($tokens);
        $character = $this->webPageContent[$counter];
        if(!$this->matchesMarker($tokens[$scanner["index"]],$scanner["offset"],$character)) {
            return null;
        }

        $scanner["index"]++;
        if($scanner["index"] == $tokenCount) {
            $scanner["index"] = 0;
            return $scanner["url"];
        }
        if($scanner["index"] == $tokenCount - 1) {
            $counter++;
            $reference = $this->readQuotedValue($counter,"#");
            if($reference === null) {
                $scanner["index"] = 0;
                $scanner["url"] = "";
            }
            else {
                $scanner["url"] = $this->resolveReference($reference,$directoryURL);
            }
        }
        return null;
    }

    /**
     * Turn an attribute value into an absolute URL where its prefix allows.
     *
     * Only the leading run of "." and "/" characters is interpreted:
     *   "../" (repeatable) climbs one directory per occurrence, never above
     *                      the scheme://authority part of the base;
     *   "./"               is relative to the base URL;
     *   "/"                is relative to scheme://authority of the base;
     *   "//"               inherits only the scheme of the base.
     * The base URL is treated as a directory; a trailing "/" on it is ignored.
     * Values without such a prefix are returned unchanged, so plain relative
     * names like "page.html" are not resolved.
     *
     * @param string $reference    Attribute value as found in the page.
     * @param string $directoryURL Base URL.
     * @return string Resolved URL, or the remaining text when the prefix
     *                matches none of the forms above.
     */
    private function resolveReference($reference, $directoryURL) {
        $prefixLength = strspn($reference,"./");
        if($prefixLength == 0) {
            return $reference;
        }
        $path = (string) substr($reference,$prefixLength);

        $baseURL = rtrim($directoryURL,"/");
        $segments = explode("/",$baseURL);   // "https:", "", authority, directories...

        $parentSteps = 0;
        $currentSteps = 0;
        $singleSlash = 0;
        $doubleSlash = 0;
        $dotCount = 0;
        $slashCount = 0;
        for($index = 0; $index < $prefixLength; $index++) {
            if($reference[$index] == ".") {
                $dotCount++;
                $slashCount = 0;
                continue;
            }
            $slashCount++;
            if($dotCount == 2 && $slashCount == 1) {
                $parentSteps++;
            }
            else if($dotCount == 1 && $slashCount == 1) {
                $currentSteps++;
            }
            else if($dotCount == 0 && $slashCount == 1) {
                $singleSlash++;
            }
            else if($dotCount == 0 && $slashCount == 2) {
                $singleSlash = 0;
                $doubleSlash++;
            }
            // Start counting dots afresh so that every "../" is recognised.
            $dotCount = 0;
        }

        if($parentSteps > 0) {
            // Keep at least scheme, the empty part between the two slashes
            // and the authority; otherwise the host itself would be dropped.
            $keep = max(3,count($segments) - $parentSteps);
            return implode("/",array_slice($segments,0,$keep))."/".$path;
        }
        if($currentSteps > 0) {
            return $baseURL."/".$path;
        }
        if($singleSlash > 0) {
            return implode("/",array_slice($segments,0,3))."/".$path;
        }
        if($doubleSlash > 0) {
            return $segments[0]."//".$path;
        }
        return $path;
    }
}
488 ?>

No comments:

Post a Comment