aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgi Nikolov <ggeorgi60@gmail.com>2025-01-19 12:56:55 +0200
committerGeorgi Nikolov <ggeorgi60@gmail.com>2025-01-19 12:56:55 +0200
commit98f3961739ee4ab7c79c1e0ae5131b85a2e24198 (patch)
tree903008132516ea0caf71f8be23870c45000006ab
parentd391a2cbf0cd59ceace2cabd042330e9e1dea4ee (diff)
downloadnowayforward_human-98f3961739ee4ab7c79c1e0ae5131b85a2e24198.tar
nowayforward_human-98f3961739ee4ab7c79c1e0ae5131b85a2e24198.tar.gz
nowayforward_human-98f3961739ee4ab7c79c1e0ae5131b85a2e24198.zip
Added downloading of all the resources for a site as well as the site itself, thus ensuring that the site will load correctly offline
This required a change in the download method: curl is now used instead of the `file_get_contents` function
-rw-r--r--controllers/archive_page.php58
1 files changed, 56 insertions, 2 deletions
diff --git a/controllers/archive_page.php b/controllers/archive_page.php
index 7f62d7b..73875cd 100644
--- a/controllers/archive_page.php
+++ b/controllers/archive_page.php
@@ -19,7 +19,7 @@ class DownloadPage {
list($website_exists, $this->page_url) = $this->does_website_exist($this->page_url);
if ($website_exists) {
$this->zip_name = Database\Webpage::create($zip_location, $page_url, 1) . '.zip';
- $this->page_contents = file_get_contents($this->page_url);
+ $this->page_contents = $this->download_file($this->page_url);
$zip = $this->create_zip_archive();
} else {
echo "Website does not exist";
@@ -43,8 +43,16 @@ class DownloadPage {
return $protocol . $url;
}
- function does_website_exist($url) {
+ function download_file($url) {
+ $curl_func = curl_init($url);
+ curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true);
+ $page_contents = curl_exec($curl_func);
+ curl_close($curl_func);
+ return $page_contents;
+ }
+ function does_website_exist($url) {
// Check if the site exists with https
$https_url = $this->apply_correct_protocol($url, "https://");
if ($https_url != $url) {
@@ -73,10 +81,56 @@ class DownloadPage {
return array(false, $url);
}
+ function resolveUrl($relativeUrl, $baseUrl) {
+ // If the url is already absolute return it
+ if (parse_url($relativeUrl, PHP_URL_SCHEME)) {
+ return $relativeUrl;
+ }
+ // Otherwise resolve it against the base url
+ return rtrim($baseUrl, '/') . '/' . ltrim($relativeUrl, '/');
+ }
+
+ function download_source(&$dom, &$zip, $tagName, $attribute) {
+ $links = $dom->getElementsByTagName($tagName);
+ foreach($links as $link) {
+ $source = $link->getAttribute($attribute);
+ if ($source) {
+ $sourceUrl = $this->resolveUrl($source, $this->page_url);
+ if ($this->is_resource_accessible($sourceUrl)) {
+ $sourceContent = $this->download_file($sourceUrl);
+ if ($sourceContent) {
+ $link->setAttribute($attribute, $sourceUrl);
+ $zip->addFromString(basename($source), $sourceContent);
+ }
+ }
+ }
+ }
+ }
+
+ function is_resource_accessible($url) {
+ $curl_func = curl_init($url);
+ curl_setopt($curl_func, CURLOPT_NOBODY, true); // Gives only the headers
+ curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true);
+ curl_exec($curl_func);
+ $code = curl_getinfo($curl_func, CURLINFO_HTTP_CODE);
+ curl_close($curl_func);
+ return ($code >= 200 && $code < 400);
+ }
+
function create_zip_archive() {
// Creates and returns a zip object resulted from zipping the page that was downloaded
$zip = new ZipArchive();
if ($zip->open($this->zip_location . '/' . $this->zip_name, ZipArchive::CREATE) === TRUE) {
+
+ $dom = new DOMDocument();
+ @$dom->loadHTML($this->page_contents); // This suppresses warnings for invalid HTML
+
+ $this->download_source($dom, $zip, 'link', 'href');
+ $this->download_source($dom, $zip, 'script', 'src');
+ $this->download_source($dom, $zip, 'img', 'src');
+
+ $this->page_contents = $dom->saveHTML();
$zip->addFromString('index.html', $this->page_contents);
$zip->close();
echo "Archived {$this->page_url}";