aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgi Nikolov <ggeorgi60@gmail.com>2025-01-19 12:56:55 +0200
committerGeorgi Nikolov <ggeorgi60@gmail.com>2025-01-19 12:56:55 +0200
commit98f3961739ee4ab7c79c1e0ae5131b85a2e24198 (patch)
tree903008132516ea0caf71f8be23870c45000006ab
parentd391a2cbf0cd59ceace2cabd042330e9e1dea4ee (diff)
downloadnowayforward_human-98f3961739ee4ab7c79c1e0ae5131b85a2e24198.tar
nowayforward_human-98f3961739ee4ab7c79c1e0ae5131b85a2e24198.tar.gz
nowayforward_human-98f3961739ee4ab7c79c1e0ae5131b85a2e24198.zip
Added downloading of all the resources for a site as well as the site itself, thus ensuring that the site will load correctly offline
This required a change in the download method: curl is now used instead of the `file_get_contents` function
-rw-r--r--controllers/archive_page.php58
1 files changed, 56 insertions, 2 deletions
diff --git a/controllers/archive_page.php b/controllers/archive_page.php
index 7f62d7b..73875cd 100644
--- a/controllers/archive_page.php
+++ b/controllers/archive_page.php
@@ -19,7 +19,7 @@ class DownloadPage {
list($website_exists, $this->page_url) = $this->does_website_exist($this->page_url);
if ($website_exists) {
$this->zip_name = Database\Webpage::create($zip_location, $page_url, 1) . '.zip';
- $this->page_contents = file_get_contents($this->page_url);
+ $this->page_contents = $this->download_file($this->page_url);
$zip = $this->create_zip_archive();
} else {
echo "Website does not exist";
@@ -43,8 +43,16 @@ class DownloadPage {
return $protocol . $url;
}
- function does_website_exist($url) {
+ function download_file($url) {
+ $curl_func = curl_init($url);
+ curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true);
+ $page_contents = curl_exec($curl_func);
+ curl_close($curl_func);
+ return $page_contents;
+ }
+ function does_website_exist($url) {
// Check if the site exists with https
$https_url = $this->apply_correct_protocol($url, "https://");
if ($https_url != $url) {
@@ -73,10 +81,56 @@ class DownloadPage {
return array(false, $url);
}
+ function resolveUrl($relativeUrl, $baseUrl) {
+ // If the url is already absolute return it
+ if (parse_url($relativeUrl, PHP_URL_SCHEME)) {
+ return $relativeUrl;
+ }
+ // Otherwise resolve it against the base url
+ return rtrim($baseUrl, '/') . '/' . ltrim($relativeUrl, '/');
+ }
+
+ function download_source(&$dom, &$zip, $tagName, $attribute) {
+ $links = $dom->getElementsByTagName($tagName);
+ foreach($links as $link) {
+ $source = $link->getAttribute($attribute);
+ if ($source) {
+ $sourceUrl = $this->resolveUrl($source, $this->page_url);
+ if ($this->is_resource_accessible($sourceUrl)) {
+ $sourceContent = $this->download_file($sourceUrl);
+ if ($sourceContent) {
+ $link->setAttribute($attribute, $sourceUrl);
+ $zip->addFromString(basename($source), $sourceContent);
+ }
+ }
+ }
+ }
+ }
+
+ function is_resource_accessible($url) {
+ $curl_func = curl_init($url);
+ curl_setopt($curl_func, CURLOPT_NOBODY, true); // Gives only the headers
+ curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true);
+ curl_exec($curl_func);
+ $code = curl_getinfo($curl_func, CURLINFO_HTTP_CODE);
+ curl_close($curl_func);
+ return ($code >= 200 && $code < 400);
+ }
+
function create_zip_archive() {
// Creates and returns a zip object resulted from zipping the page that was downloaded
$zip = new ZipArchive();
if ($zip->open($this->zip_location . '/' . $this->zip_name, ZipArchive::CREATE) === TRUE) {
+
+ $dom = new DOMDocument();
+ @$dom->loadHTML($this->page_contents); // This suppresses warnings for invalid HTML
+
+ $this->download_source($dom, $zip, 'link', 'href');
+ $this->download_source($dom, $zip, 'script', 'src');
+ $this->download_source($dom, $zip, 'img', 'src');
+
+ $this->page_contents = $dom->saveHTML();
$zip->addFromString('index.html', $this->page_contents);
$zip->close();
echo "Archived {$this->page_url}";