diff options
Diffstat (limited to 'controllers')
| -rw-r--r-- | controllers/archive_page.php | 58 |
1 files changed, 56 insertions, 2 deletions
diff --git a/controllers/archive_page.php b/controllers/archive_page.php index 7f62d7b..73875cd 100644 --- a/controllers/archive_page.php +++ b/controllers/archive_page.php @@ -19,7 +19,7 @@ class DownloadPage { list($website_exists, $this->page_url) = $this->does_website_exist($this->page_url); if ($website_exists) { $this->zip_name = Database\Webpage::create($zip_location, $page_url, 1) . '.zip'; - $this->page_contents = file_get_contents($this->page_url); + $this->page_contents = $this->download_file($this->page_url); $zip = $this->create_zip_archive(); } else { echo "Website does not exist"; @@ -43,8 +43,16 @@ class DownloadPage { return $protocol . $url; } - function does_website_exist($url) { + function download_file($url) { + $curl_func = curl_init($url); + curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true); + $page_contents = curl_exec($curl_func); + curl_close($curl_func); + return $page_contents; + } + function does_website_exist($url) { // Check if the site exists with https $https_url = $this->apply_correct_protocol($url, "https://"); if ($https_url != $url) { @@ -73,10 +81,56 @@ class DownloadPage { return array(false, $url); } + function resolveUrl($relativeUrl, $baseUrl) { + // If the url is already absolute return it + if (parse_url($relativeUrl, PHP_URL_SCHEME)) { + return $relativeUrl; + } + // Otherwise resolve it against the base url + return rtrim($baseUrl, '/') . '/' . 
ltrim($relativeUrl, '/'); + } + + function download_source(&$dom, &$zip, $tagName, $attribute) { + $links = $dom->getElementsByTagName($tagName); + foreach($links as $link) { + $source = $link->getAttribute($attribute); + if ($source) { + $sourceUrl = $this->resolveUrl($source, $this->page_url); + if ($this->is_resource_accessible($sourceUrl)) { + $sourceContent = $this->download_file($sourceUrl); + if ($sourceContent) { + $link->setAttribute($attribute, $sourceUrl); + $zip->addFromString(basename($source), $sourceContent); + } + } + } + } + } + + function is_resource_accessible($url) { + $curl_func = curl_init($url); + curl_setopt($curl_func, CURLOPT_NOBODY, true); // Gives only the headers + curl_setopt($curl_func, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curl_func, CURLOPT_FOLLOWLOCATION, true); + curl_exec($curl_func); + $code = curl_getinfo($curl_func, CURLINFO_HTTP_CODE); + curl_close($curl_func); + return ($code >= 200 && $code < 400); + } + function create_zip_archive() { // Creates and returns a zip object resulted from zipping the page that was downloaded $zip = new ZipArchive(); if ($zip->open($this->zip_location . '/' . $this->zip_name, ZipArchive::CREATE) === TRUE) { + + $dom = new DOMDocument(); + @$dom->loadHTML($this->page_contents); // This suppresses warnings for invalid HTML + + $this->download_source($dom, $zip, 'link', 'href'); + $this->download_source($dom, $zip, 'script', 'src'); + $this->download_source($dom, $zip, 'img', 'src'); + + $this->page_contents = $dom->saveHTML(); $zip->addFromString('index.html', $this->page_contents); $zip->close(); echo "Archived {$this->page_url}"; |
