diff options
| -rw-r--r-- | controllers/archive_page.php | 60 | ||||
| -rw-r--r-- | models/webpage.php | 9 |
2 files changed, 52 insertions, 17 deletions
diff --git a/controllers/archive_page.php b/controllers/archive_page.php index ef0f50d..b8116b6 100644 --- a/controllers/archive_page.php +++ b/controllers/archive_page.php @@ -17,20 +17,23 @@ class DownloadPage { $this->folder_location = $folder_location; $this->page_url = $page_url; list($website_exists, $this->page_url) = $this->doesWebsiteExist($this->page_url); - // TODO: Make a clause for whether the same site was already archived at least once - // This should happen with a request to the database - // If such site exists then when downloading the resources check whether some of the resources already exist in the - // old archive - // If they do dont download them (or rather delete and make the pointers point to the correct archive folder + // Search for all the regexes that fit the *url* pattern where the pattern is the requested url but without the protocol + $page_url_pattern = $this->getCorrectLinkPattern($page_url); + $simular_pages = Database\Webpage::getArchivePathsByPattern('%' . $page_url_pattern . '%'); if ($website_exists) { $this->folder_name = Database\Webpage::create($folder_location, $page_url, 1); $this->page_contents = $this->downloadFile($this->page_url); - $this->createArchive(); + $this->createArchive($simular_pages); } else { echo "Website does not exist"; } } + function getCorrectLinkPattern($page_url) : string { + $page_url = substr($page_url, strpos($page_url, "//"), strlen($page_url)); + return $page_url; + } + function setFolderLocation($folder_location) : void { $this->folder_location = $folder_location; } @@ -40,7 +43,7 @@ class DownloadPage { function setPageUrl($page_url) : void { $this->page_url = $page_url; } - function applyCorrectProtocol($url, $protocol) : void { + function applyCorrectProtocol($url, $protocol) : string { if (str_contains($url, $protocol)) { return $url; } @@ -57,7 +60,7 @@ class DownloadPage { return $page_contents; } - function doesWebsiteExist($url) : array(bool, string) { + function doesWebsiteExist($url) : array { // Check if the site exists with https $https_url = $this->applyCorrectProtocol($url, "https://"); if ($https_url != $url) { @@ -95,7 +98,7 @@ class DownloadPage { return rtrim($baseUrl, '/') . '/' . ltrim($relativeUrl, '/'); } - function downloadSource(&$dom, $folder_path, $tagName, $attribute) : void { + function downloadSource(&$dom, $folder_path, $tagName, $attribute, $simular_pages) : void { $links = $dom->getElementsByTagName($tagName); foreach($links as $link) { $source = $link->getAttribute($attribute); @@ -104,10 +107,33 @@ class DownloadPage { if ($this->isResourceAccessible($sourceUrl)) { $sourceContent = $this->downloadFile($sourceUrl); if ($sourceContent) { - $link->setAttribute($attribute, './' . basename($source)); - $file = fopen($folder_path . '/' . basename($source), "w"); - fwrite($file, $sourceContent); - fclose($file); + $found_resource = false; + if (count($simular_pages) != 0) { + // Page is not unique so check if any other already downloaded resource is + // the same as the resource that is needed thus not actually needing to download it + foreach($simular_pages as $page) { + $resourceName = basename($source); + if (!file_exists($this->folder_location . "/" . $page->WID . "/" . $resourceName)) { + continue; + } + $resourceContents = file_get_contents($this->folder_location . "/" . $page->WID . "/" . $resourceName); + if (strlen($resourceContents) == strlen($sourceContent) && md5($resourceContents) == md5($sourceContent)) { + // They are the same resource + // change the link to point to the source of the previous archive instead of downloading a news source + $link->setAttribute($attribute, "../" . $page->WID . "/" . $resourceName); + $found_resource = true; + break; + } + } + } + + if (!$found_resource) { + // Page is unique so there will be no resource that can be cached + $link->setAttribute($attribute, './' . basename($source)); + $file = fopen($folder_path . '/' . basename($source), "w"); + fwrite($file, $sourceContent); + fclose($file); + } } } } @@ -125,7 +151,7 @@ class DownloadPage { return ($code >= 200 && $code < 400); } - function createArchive() : void { + function createArchive($simular_pages) : void { // Creates the folder with the correct resources and the main html page in a index.html tag $dom = new DOMDocument(); @$dom->loadHTML($this->page_contents); // This suppresses warnings for invalid HTML @@ -135,9 +161,9 @@ class DownloadPage { mkdir($folder_path, 0777, true); } - $this->downloadSource($dom, $folder_path, 'link', 'href'); - $this->downloadSource($dom, $folder_path, 'script', 'src'); - $this->downloadSource($dom, $folder_path, 'img', 'src'); + $this->downloadSource($dom, $folder_path, 'link', 'href', $simular_pages); + $this->downloadSource($dom, $folder_path, 'script', 'src', $simular_pages); + $this->downloadSource($dom, $folder_path, 'img', 'src', $simular_pages); $this->page_contents = $dom->saveHTML(); $indexFile = fopen($folder_path . '/index.html', "w"); diff --git a/models/webpage.php b/models/webpage.php index 46c8407..1a2f4a6 100644 --- a/models/webpage.php +++ b/models/webpage.php @@ -42,6 +42,15 @@ class Webpage extends Table { ); } + static function getArchivePathsByPattern(string $URLPattern) : array { + return Table::_get_all( + 'Webpages', + 'Database\Webpage', + "WHERE URL LIKE \"$URLPattern\" ORDER BY Date DESC", + "Path, WID" + ); + } + function incrementVisits() { Table::_update( 'Webpages', |
