diff options
| author | Georgi Nikolov <ggeorgi60@gmail.com> | 2025-01-25 14:14:29 +0200 |
|---|---|---|
| committer | Georgi Nikolov <ggeorgi60@gmail.com> | 2025-01-25 14:14:29 +0200 |
| commit | 46e75813c66807fd0297cbf3514a1a5f549d604a (patch) | |
| tree | 02712cc62a15b447cbb2fa164a653455416e7d1b | |
| parent | 04b77a264f8d7fbc9069ad96fa1a448f1f3ecb12 (diff) | |
| download | nowayforward_human-46e75813c66807fd0297cbf3514a1a5f549d604a.tar nowayforward_human-46e75813c66807fd0297cbf3514a1a5f549d604a.tar.gz nowayforward_human-46e75813c66807fd0297cbf3514a1a5f549d604a.zip | |
Handled the hyperlinks in the webpage that point to different sites
| -rw-r--r-- | controllers/archive_page.php | 54 |
1 file changed, 51 insertions, 3 deletions
diff --git a/controllers/archive_page.php b/controllers/archive_page.php index b8116b6..fddea57 100644 --- a/controllers/archive_page.php +++ b/controllers/archive_page.php @@ -13,6 +13,14 @@ class DownloadPage { private $page_url; private $page_contents; + private function debugPrintToConsole($data) : void{ + $output = $data; + if (is_array($output)) + $output = implode(',', $output); + + echo "<script>console.log('Debug Objects: " . $output . "' );</script>"; + } + function __construct($page_url, $folder_location) { $this->folder_location = $folder_location; $this->page_url = $page_url; @@ -30,7 +38,8 @@ class DownloadPage { } function getCorrectLinkPattern($page_url) : string { - $page_url = substr($page_url, strpos($page_url, "//"), strlen($page_url)); + // NOTE: Offset by 2 because of the '//' of the protocol + $page_url = substr($page_url, strpos($page_url, "//") + 2, strlen($page_url)); return $page_url; } @@ -55,6 +64,7 @@ class DownloadPage { $curl_func = curl_init($url); curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true); curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($curl_func, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko; compatible; pageburst) Chrome/131.0.6778.204 Safari/537.36"); $page_contents = curl_exec($curl_func); curl_close($curl_func); return $page_contents; @@ -131,8 +141,10 @@ class DownloadPage { // Page is unique so there will be no resource that can be cached $link->setAttribute($attribute, './' . basename($source)); $file = fopen($folder_path . '/' . 
basename($source), "w"); - fwrite($file, $sourceContent); - fclose($file); + if ($file){ + fwrite($file, $sourceContent); + fclose($file); + } } } } @@ -140,6 +152,40 @@ class DownloadPage { } } + // Changes the hyperlinks in the site to ones that are local for the site + // or to the landing page when a page is not archived if the hyperlink of the + // other page is not archived + function changeHyperlinkToLocal(&$dom, $tagName, $attribute) : void { + $tags = $dom->getElementsByTagName($tagName); + foreach($tags as $tag) { + $link = $tag->getAttribute($attribute); + // Make a request to the db and check if any URLs like the 'link' + // exist in it and are presently donwloaded + //$link_url = $this->resolveUrl($link); + $page_url_pattern = $this->getCorrectLinkPattern($link); + // TODO: The link should depend on whether there is a domain in the front or not + $correct_results = Database\Webpage::getArchivePathsByPattern('%' . $page_url_pattern . '%'); + + if (count($correct_results) != 0) { + // If there are any links that are the same as the urls make the $dom attribute point + // to the latest version of that page + $tag->setAttribute($attribute, "../" . $correct_results[0]->WID . "/index.html"); + } else { + // If there are no pages that are like that url point to the landing page of the site + // that says that this page was not yet archived + $tag->setAttribute($attribute, "../../archive/index.php?page_url=" . $this->baseToFullUrlForGet($this->page_url, $link)); + $this->debugPrintToConsole($this->baseToFullUrlForGet($this->page_url, $link)); + } + } + } + + function baseToFullUrlForGet($url, $base) : string { + $replaced = rtrim($url, '/') . '/' . 
ltrim($base, '/'); + $replaced = str_replace('/', '%2F', $replaced); + $replaced = str_replace(':', '%3A', $replaced); + return $replaced; + } + function isResourceAccessible($url) : bool { $curl_func = curl_init($url); curl_setopt($curl_func, CURLOPT_NOBODY, true); // Gives only the headers @@ -165,6 +211,8 @@ class DownloadPage { $this->downloadSource($dom, $folder_path, 'script', 'src', $simular_pages); $this->downloadSource($dom, $folder_path, 'img', 'src', $simular_pages); + $this->changeHyperlinkToLocal($dom, 'a', 'href'); + $this->page_contents = $dom->saveHTML(); $indexFile = fopen($folder_path . '/index.html', "w"); fwrite($indexFile, $this->page_contents); |
