Building a PHP Web Crawler from Scratch
A web crawler, also known as a web spider, is an automated program or script that systematically navigates through websites on the Internet to gather information. Web crawlers start by visiting a seed URL and then follow hyperlinks to other pages, recursively exploring and indexing the content they find. The primary purpose of a web crawler is to gather data from web pages, such as text, images, links, metadata, and other relevant information. This collected data is typically used for various purposes, including web indexing, data mining, content scraping, and search engine optimization. Web crawlers work by sending HTTP requests to web servers, downloading web pages, parsing the HTML or other structured data, and extracting relevant information. They follow the links found on each page to discover new URLs to crawl, creating a vast network of interconnected web pages. You can get the complete code from GitHub.
index.php
<?php
// Demo entry point: crawl a single page and dump what the crawler extracted.
// config.php must be loaded first because WebCrawler reads its globals.
include_once("config.php");
include_once("WebCrawler.php");

$obj = new WebCrawler();
$data = $obj->parser("https://www.algoberry.com");

// Title, meta and URL strings come from a remote page, so they are escaped
// before being echoed into HTML. ENT_SUBSTITUTE keeps the dump visible even
// when the crawled page is not valid UTF-8.
echo "<pre>";
echo htmlspecialchars(print_r($data, true), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
echo "</pre>";
?>
config.php
<?php
// Global configuration read (via `global`) by WebCrawler::parser().
// Each pattern comes with the counters the character matcher uses to track
// how much of the pattern has been matched so far.

//-- <head> ... </head> delimiters
$outerHead = "<head>";
$outerHeadLength = strlen($outerHead);
$outerHeadStart = 0;

$innerHead = "</head>";
$innerHeadLength = strlen($innerHead);
$innerHeadStart = 0;
//--

//-- <title> ... </title> delimiters
$outerTitle = "<title>";
$outerTitleLength = strlen($outerTitle);
$outerTitleStart = 0;

$innerTitle = "</title>";
$innerTitleLength = strlen($innerTitle);
$innerTitleStart = 0;
//--

//-- <meta ... > delimiters
$outerMeta = "<meta";
$innerMeta = ">";
$metaPointer = 0;
//--

//-- meta attribute: name=
$metaNameBase = "name=";
$metaNamePointer = 0;
//--

//-- meta attribute: property=
$metaPropertyBase = "property=";
$metaPropertyPointer = 0;
//--

//-- meta attribute: content=
$metaContentBase = "content=";
$metaContentPointer = 0;
//--

//-- <a href=...> is matched in four stages, one array entry per stage
$hrefTag = array();
$hrefTag[0] = "<a";
$hrefTag[1] = "href";
$hrefTag[2] = "=";
$hrefTag[3] = ">";
$hrefTagCountStart = 0;                        // index of the stage being matched
$hrefTagCountFinal = count($hrefTag);
$hrefTagLengthStart = 0;                       // characters matched in the current stage
$hrefTagLengthFinal = strlen($hrefTag[0]);
$hrefTagPointer =& $hrefTag[0];                // pattern of the current stage
//--

//-- <img src=...> is matched in four stages, one array entry per stage
$imgTag = array();
$imgTag[0] = "<img";
$imgTag[1] = "src";
$imgTag[2] = "=";
$imgTag[3] = ">";
$imgTagCountStart = 0;                         // index of the stage being matched
$imgTagCountFinal = count($imgTag);
$imgTagLengthStart = 0;                        // characters matched in the current stage
$imgTagLengthFinal = strlen($imgTag[0]);
$imgTagPointer =& $imgTag[0];                  // pattern of the current stage
//--

//-- cURL options applied to every page download
$crawlOptions = array(
    CURLOPT_RETURNTRANSFER => true,            // return web page
    CURLOPT_HEADER => false,                   // don't return headers
    CURLOPT_FOLLOWLOCATION => true,            // follow redirects
    CURLOPT_ENCODING => "",                    // handle all encodings
    CURLOPT_USERAGENT => "algoberrybot",       // who am i
    CURLOPT_AUTOREFERER => true,               // set referer on redirect
    CURLOPT_CONNECTTIMEOUT => 10,              // timeout on connect
    CURLOPT_TIMEOUT => 10,                     // timeout on response
    // Was 0, which together with CURLOPT_FOLLOWLOCATION makes every
    // redirecting URL fail with a "too many redirects" error.
    CURLOPT_MAXREDIRS => 10                    // stop after 10 redirects
);
//--
?>
WebCrawler.php
<?php
/**
 * Single-page crawler.
 *
 * Downloads one URL with cURL and extracts the <title>, the <meta>
 * name/property => content pairs, the <a href> links and the <img src>
 * sources using hand-written character matchers (no DOM parser).
 *
 * The search patterns, the matcher counters and the cURL options are the
 * globals defined in config.php, which must be included before parser() runs.
 */
class WebCrawler
{
    /** @var string|bool Body returned by curl_exec() for the last crawled URL (false on failure). */
    public $webPageContent;

    /**
     * Fetch a page and extract its title, meta tags, links and images.
     *
     * The URL is treated as a directory when relative references are
     * resolved ("./x" becomes "<url>/x", "../x" drops the last segment).
     *
     * @param string $directoryURL Absolute URL of the page to crawl.
     * @return array Possibly empty array with the optional keys
     *               "title" (string), "meta" (name/property => content),
     *               "href" ("own" / "other" lists of URLs, split by host) and
     *               "img" (list of URLs). Empty when the URL is invalid or
     *               the download fails.
     */
    public function parser($directoryURL)
    {
        global $outerHead, $outerHeadLength, $outerHeadStart;
        global $innerHead, $innerHeadLength, $innerHeadStart;
        global $outerTitle, $outerTitleLength, $outerTitleStart;
        global $innerTitle, $innerTitleLength, $innerTitleStart;

        global $outerMeta, $innerMeta, $metaPointer;
        global $metaNameBase, $metaNamePointer;
        global $metaPropertyBase, $metaPropertyPointer;
        global $metaContentBase, $metaContentPointer;

        global $hrefTag, $hrefTagCountStart, $hrefTagCountFinal, $hrefTagLengthStart, $hrefTagLengthFinal;
        global $imgTag, $imgTagCountStart, $imgTagCountFinal, $imgTagLengthStart, $imgTagLengthFinal;
        global $crawlOptions;

        $result = array();

        if ($directoryURL == "" || filter_var($directoryURL, FILTER_VALIDATE_URL) === false) {
            return $result;
        }

        // The matcher counters are globals, so a previous call that stopped in
        // the middle of a pattern would otherwise leak its state into this one.
        $outerHeadStart = 0;
        $innerHeadStart = 0;
        $outerTitleStart = 0;
        $innerTitleStart = 0;
        $metaPointer = 0;
        $metaNamePointer = 0;
        $metaPropertyPointer = 0;
        $metaContentPointer = 0;
        $hrefTagCountStart = 0;
        $hrefTagLengthStart = 0;
        $hrefNeedle = $hrefTag[0];
        $hrefTagLengthFinal = strlen($hrefNeedle);
        $imgTagCountStart = 0;
        $imgTagLengthStart = 0;
        $imgNeedle = $imgTag[0];
        $imgTagLengthFinal = strlen($imgNeedle);

        // Base used to resolve relative references. Trailing slashes are
        // dropped so that "base/" + "/x" does not produce a double slash.
        $baseURL = rtrim($directoryURL, "/");
        $urlParser = preg_split("/\//", $baseURL);
        $dump = parse_url($directoryURL);
        $ownHost = isset($dump["host"]) ? $dump["host"] : "";

        $curlObject = curl_init($directoryURL);
        curl_setopt_array($curlObject, $crawlOptions);
        $this->webPageContent = curl_exec($curlObject);
        $errorNumber = curl_errno($curlObject);
        curl_close($curlObject);

        if ($errorNumber != 0 || !is_string($this->webPageContent)) {
            return $result;
        }

        //-- Pass 1: locate <head> and read <title> / <meta> up to </head>.
        // Spaces are skipped while matching, so "< head >" also matches.
        $counter = 0;
        $contentLength = strlen($this->webPageContent);
        $headFound = false;
        while ($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if ($character == " ") {
                $counter++;
                continue;
            }
            if ($outerHead[$outerHeadStart] == $character) {
                $outerHeadStart++;
                if ($outerHeadStart == $outerHeadLength) {
                    $outerHeadStart = 0;
                    $headFound = true;
                    $counter++;
                    while ($counter < $contentLength) {
                        $character = $this->webPageContent[$counter];
                        if ($character == " ") {
                            $counter++;
                            continue;
                        }

                        //-- <title>...</title>
                        if ($outerTitle[$outerTitleStart] == $character) {
                            $outerTitleStart++;
                            if ($outerTitleStart == $outerTitleLength) {
                                $outerTitleStart = 0;
                                $counter++;
                                $startPosition = $counter;
                                while ($counter < $contentLength) {
                                    $character = $this->webPageContent[$counter];
                                    if ($character == " ") {
                                        $counter++;
                                        continue;
                                    }
                                    if ($innerTitle[$innerTitleStart] == $character) {
                                        if ($innerTitleStart == 0) {
                                            // Candidate end of the title text: just before the "<".
                                            $endPosition = $counter - 1;
                                        }
                                        $innerTitleStart++;
                                        if ($innerTitleStart == $innerTitleLength) {
                                            $innerTitleStart = 0;
                                            $result["title"] = $this->collectData($startPosition, $endPosition);
                                            break;
                                        }
                                    } else {
                                        $innerTitleStart = 0;
                                    }
                                    $counter++;
                                }
                            }
                        } else {
                            $outerTitleStart = 0;
                        }
                        //--

                        //-- <meta name|property="..." content="...">
                        if ($outerMeta[$metaPointer] == $character) {
                            $metaPointer++;
                            if ($metaPointer == strlen($outerMeta)) {
                                $metaPointer = 0;
                                $metaType = "";
                                $metaValue = "";
                                $counter++;
                                while ($counter < $contentLength) {
                                    $character = $this->webPageContent[$counter];
                                    if ($character == " ") {
                                        $counter++;
                                        continue;
                                    }
                                    if ($metaNameBase[$metaNamePointer] == $character) {
                                        $metaNamePointer++;
                                        if ($metaNamePointer == strlen($metaNameBase)) {
                                            $metaNamePointer = 0;
                                            $counter++;
                                            $metaType = $this->readQuotedValue($counter, $contentLength);
                                        }
                                    } else {
                                        $metaNamePointer = 0;
                                    }
                                    if ($metaPropertyBase[$metaPropertyPointer] == $character) {
                                        $metaPropertyPointer++;
                                        if ($metaPropertyPointer == strlen($metaPropertyBase)) {
                                            $metaPropertyPointer = 0;
                                            $counter++;
                                            $metaType = $this->readQuotedValue($counter, $contentLength);
                                        }
                                    } else {
                                        $metaPropertyPointer = 0;
                                    }
                                    if ($metaContentBase[$metaContentPointer] == $character) {
                                        $metaContentPointer++;
                                        if ($metaContentPointer == strlen($metaContentBase)) {
                                            $metaContentPointer = 0;
                                            $counter++;
                                            $metaValue = $this->readQuotedValue($counter, $contentLength);
                                        }
                                    } else {
                                        $metaContentPointer = 0;
                                    }
                                    if ($innerMeta == $character) {
                                        if ($metaType != "" && $metaValue != "") {
                                            $result["meta"][$metaType] = $metaValue;
                                        }
                                        break;
                                    }
                                    $counter++;
                                }
                            }
                        } else {
                            $metaPointer = 0;
                        }
                        //--

                        //-- </head> ends pass 1
                        if ($innerHead[$innerHeadStart] == $character) {
                            $innerHeadStart++;
                            if ($innerHeadStart == $innerHeadLength) {
                                $innerHeadStart = 0;
                                break;
                            }
                        } else {
                            $innerHeadStart = 0;
                        }
                        //--
                        $counter++;
                    }
                    break;
                }
            } else {
                $outerHeadStart = 0;
            }
            $counter++;
        }
        //--

        //-- Pass 2: collect <a href> and <img src> from the rest of the page.
        // Without a literal "<head>" pass 1 consumed the whole document, so
        // the link scan restarts from the beginning instead of finding nothing.
        $counter = $headFound ? $counter + 1 : 0;

        // Last attribute value read; shared by both tag matchers.
        $pending = array(
            "url" => "",
            "previous" => 0,
            "current" => 0,
            "single" => 0,
            "double" => 0,
            "skip" => false,
        );

        while ($counter < $contentLength) {
            $character = $this->webPageContent[$counter];

            if ($hrefNeedle[$hrefTagLengthStart] == $character) {
                $hrefTagLengthStart++;
                if ($hrefTagLengthStart == $hrefTagLengthFinal) {
                    $hrefTagCountStart++;
                    if ($hrefTagCountStart == $hrefTagCountFinal) {
                        // Closing ">" reached: emit the link read at stage 3.
                        $hrefTagCountStart = 0;
                        $hrefURL = $this->resolveUrl($pending, $urlParser, $baseURL);
                        if (filter_var($hrefURL, FILTER_VALIDATE_URL) !== false) {
                            $dump = parse_url($hrefURL);
                            // Host-less URLs (e.g. mailto:) count as foreign.
                            $linkHost = isset($dump["host"]) ? $dump["host"] : "";
                            if ($ownHost == $linkHost) {
                                $result["href"]["own"][] = $hrefURL;
                            } else {
                                $result["href"]["other"][] = $hrefURL;
                            }
                        }
                    } else if ($hrefTagCountStart == 3) {
                        // "=" just matched: the quoted attribute value follows.
                        $counter++;
                        $pending = $this->readUrlAttribute($counter, $contentLength);
                        if ($pending["skip"]) {
                            $hrefTagCountStart = 0;
                        }
                        // Keep $character in step with $counter for the img matcher below.
                        $character = $counter < $contentLength ? $this->webPageContent[$counter] : "";
                    }
                    $hrefTagLengthStart = 0;
                    $hrefNeedle = $hrefTag[$hrefTagCountStart];
                    $hrefTagLengthFinal = strlen($hrefNeedle);
                }
            } else {
                $hrefTagLengthStart = 0;
            }

            if ($imgNeedle[$imgTagLengthStart] == $character) {
                $imgTagLengthStart++;
                if ($imgTagLengthStart == $imgTagLengthFinal) {
                    $imgTagCountStart++;
                    if ($imgTagCountStart == $imgTagCountFinal) {
                        // Closing ">" reached: emit the image read at stage 3.
                        $imgTagCountStart = 0;
                        $imgURL = $this->resolveUrl($pending, $urlParser, $baseURL);
                        if (filter_var($imgURL, FILTER_VALIDATE_URL) !== false) {
                            $result["img"][] = $imgURL;
                        }
                    } else if ($imgTagCountStart == 3) {
                        // "=" just matched: the quoted attribute value follows.
                        $counter++;
                        $pending = $this->readUrlAttribute($counter, $contentLength);
                        if ($pending["skip"]) {
                            $imgTagCountStart = 0;
                        }
                    }
                    $imgTagLengthStart = 0;
                    $imgNeedle = $imgTag[$imgTagCountStart];
                    $imgTagLengthFinal = strlen($imgNeedle);
                }
            } else {
                $imgTagLengthStart = 0;
            }
            $counter++;
        }
        //--

        return $result;
    }

    /**
     * Return the bytes of the downloaded page between two offsets.
     *
     * @param int $start First offset to include (expected to be >= 0).
     * @param int $end   Last offset to include.
     * @return string The slice, or "" when $start > $end or no page is loaded.
     */
    public function collectData($start, $end)
    {
        if (!is_string($this->webPageContent) || $start > $end) {
            return "";
        }
        return (string) substr($this->webPageContent, $start, $end - $start + 1);
    }

    /**
     * Read the next quoted attribute value, starting the search at $counter.
     *
     * The value must be closed by the same quote character that opened it,
     * so an apostrophe inside a double-quoted value does not truncate it.
     *
     * @param int $counter       Scan position; left on the closing quote, or
     *                           at $contentLength when no closed value exists.
     * @param int $contentLength Length of the downloaded page.
     * @return string The text between the quotes, "" when none was found.
     */
    private function readQuotedValue(&$counter, $contentLength)
    {
        $openingQuote = "";
        $startPosition = 0;
        while ($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if ($character == "\"" || $character == "'") {
                if ($openingQuote == "") {
                    $openingQuote = $character;
                    $startPosition = $counter + 1;
                } else if ($openingQuote == $character) {
                    return $this->collectData($startPosition, $counter - 1);
                }
            }
            $counter++;
        }
        return "";
    }

    /**
     * Read a quoted href/src value and classify its leading "./", "../",
     * "/" or "//" prefix.
     *
     * @param int $counter       Scan position; left on the closing quote, on
     *                           the "#" that aborted the scan, or at
     *                           $contentLength.
     * @param int $contentLength Length of the downloaded page.
     * @return array Keys: "url" (value without its leading dots/slashes),
     *               "previous" (number of "../"), "current" (number of "./"),
     *               "single" (leading "/"), "double" (leading "//") and
     *               "skip" (true when a "#" was met, i.e. the reference
     *               carries a fragment and is to be ignored).
     */
    private function readUrlAttribute(&$counter, $contentLength)
    {
        $reference = array(
            "url" => "",
            "previous" => 0,
            "current" => 0,
            "single" => 0,
            "double" => 0,
            "skip" => false,
        );

        $openingQuote = "";
        $leftPosition = 0;
        $rightPosition = -1;
        while ($counter < $contentLength) {
            $character = $this->webPageContent[$counter];
            if ($character == "\"" || $character == "'") {
                if ($openingQuote == "") {
                    $openingQuote = $character;
                    $leftPosition = $counter + 1;
                } else if ($openingQuote == $character) {
                    $rightPosition = $counter - 1;
                    break;
                }
            } else if ($character == "#") {
                $reference["skip"] = true;
                return $reference;
            }
            $counter++;
        }

        // Consume the leading run of "." and "/" and count what it means.
        $dotCount = 0;
        $slashCount = 0;
        while ($leftPosition <= $rightPosition) {
            $character = $this->webPageContent[$leftPosition];
            if ($character == ".") {
                $dotCount++;
                $slashCount = 0;
            } else if ($character == "/") {
                $slashCount++;
                if ($dotCount == 2 && $slashCount == 1) {
                    $reference["previous"]++;
                } else if ($dotCount == 1 && $slashCount == 1) {
                    $reference["current"]++;
                } else if ($dotCount == 0 && $slashCount == 1) {
                    $reference["single"]++;
                } else if ($dotCount == 0 && $slashCount == 2) {
                    $reference["single"] = 0;
                    $reference["double"]++;
                }
                // Start counting dots afresh so "../../" counts two levels.
                $dotCount = 0;
            } else {
                break;
            }
            $leftPosition++;
        }
        $reference["url"] = $this->collectData($leftPosition, $rightPosition);

        return $reference;
    }

    /**
     * Turn a reference read by readUrlAttribute() into an absolute URL.
     *
     * @param array  $reference Result of readUrlAttribute().
     * @param array  $urlParser Base URL split on "/" (scheme, "", host, path...).
     * @param string $baseURL   Base URL without trailing slash.
     * @return string The resolved URL; the bare value when it had no prefix.
     */
    private function resolveUrl($reference, $urlParser, $baseURL)
    {
        $url = $reference["url"];

        if ($reference["previous"] > 0) {
            // Drop one path segment per "../" but never climb above
            // scheme + host (the first three "/"-separated pieces).
            $segmentCount = count($urlParser);
            $keep = min($segmentCount, max(3, $segmentCount - $reference["previous"]));
            $prefix = "";
            for ($index = 0; $index < $keep; $index++) {
                $prefix .= $urlParser[$index] . "/";
            }
            return $prefix . $url;
        }
        if ($reference["current"] > 0) {
            return $baseURL . "/" . $url;
        }
        if ($reference["single"] > 0) {
            // A leading "/" is relative to the site root, not to the page.
            $origin = isset($urlParser[2]) ? $urlParser[0] . "//" . $urlParser[2] : $baseURL;
            return $origin . "/" . $url;
        }
        if ($reference["double"] > 0) {
            // Protocol-relative: reuse the scheme of the page.
            return $urlParser[0] . "//" . $url;
        }
        return $url;
    }
}
?>
No comments:
Post a Comment