Skip to main content
5 of 5
Update wording, capitalize acronyms, add tags

PHP local document crawler

I've written a quick "local document crawler" that fetches the title tag and an expandable amount of metatag information from files on a webserver.

I develop in .NET for a living and don't have a clue what I'm doing, but the site I'm helping with only has PHP hosting.

The goal is to gather metadata from files on a server, hopefully cache the output that uses the data, and display it to the user.

We experienced some x-files stuff when the first cache-file was written, and the system itself is rather slow, even when not recursing. (There are about 200 files being read in a request) The x-files stuff being PHP files disappearing from FTP view, which might be due to permissions being automatically set by the hosting provider.

Another thing I really don't understand is why some pages just don't seem to match my regular expressions for the metatags, so if anyone spots the issue you have my thanks.

General class:

<?php
class MetaEnumerator
{
    private $patterns = array(
            "title" => "/<title>([^<]*)<\\x2Ftitle>/ix",
            "keywords" => '/<meta(?=[^>]*name="keywords")\s[^>$]*content="([^"]*)[">]*$/ixu',
            "description" => '/<meta(?=[^>]*name="description")\s[^>$]*content="([^"]*)[">]*$/ixu'
        );
    private $endPattern = "/<\/head>/ixu";

    private $path = "";
    private $recursive = false;
    private $files = null;

    function __construct($path, $recursive) {
        $this->path = $path;
        $this->recursive = $recursive;
    }

    public function AddPattern($key, $pattern)
    {
        $this->patterns[$key] = $pattern;
    }

    public function GetFiles()
    {
        $this->files = array();
        $this->AddItems($this->path);
        usort($this->files, array("MetaEnumerator", "CompareTitle"));
        return $this->files;
   }

    private static function CompareTitle($a, $b) {
        return strcmp($a["title"], $b["title"]);
    }

    private function AddItems($path)
    {
        foreach(scandir($path) as $item) {
            $this->AddItem($path, $item);
        }
    }

    private function AddItem($path, $item)
    {
        $fullPath = "$path/$item";
        if ($this->IsFolder($fullPath, $item) && $this->recursive) {
            $this->AddItems($fullPath);
        }
        else if ($this->IsHtmlFile($fullPath)) {
            $this->AddFile($fullPath);
        }
    }

    private function AddFile($fullPath)
    {
        $fileInfo = $this->GetFileInfo($fullPath);
        array_push($this->files, $fileInfo);
    }

    private function GetFileInfo($file)
    {
        $fileInfo = array();
        $fileInfo["path"] = $file;
        $fileInfo["modified"] = filemtime($file);
        $ptr = fopen($file, "r");
        foreach ($this->patterns as $key => $value) {
            $fileInfo[$key] = $this->FindPattern($ptr, $value);
        }
        fclose($ptr);
        return $fileInfo;
    }

    private function FindPattern($ptr, $pattern)
    {
        $retVal = "";
        rewind($ptr);
        while (($line = fgets($ptr)) !== FALSE) {
            if (preg_match($pattern, $line) > 0) {
                $retVal = preg_replace($pattern, "$1", $line);
                break;
            }
            if (preg_match($this->endPattern, $line) > 0) {
                break;
            }
        }
        return $retVal;
    }

    private function IsFolder($path, $item)
    {
        return is_dir($path) && $this->IsPhysical($item);
    }

    private function IsPhysical($folderPath) {
        return $folderPath !== "." && $folderPath !== "..";
    }

    private function IsHtmlFile($filePath)
    {
        $pathInfo = pathinfo($filePath);
        return !is_dir($filePath) && $pathInfo["extension"] == "html";
    }
}

A page using it:
(This hasn't been refactored yet, so lay off with the clean code comments.)

<?
include "../../../utils/MetaEnumerator.php";

$files = scandir("..");
$maxDate = null;
foreach($files as $file) {
    $date = filemtime("../$file");
    if ($maxDate == null || $date > $maxDate) {
        $maxDate = $date;
    }
}

$cacheFile = "thispage.cache";
$cacheDate = file_exists($cacheFile) ? filemtime($cacheFile)  : null;
if ($cacheDate >= $maxDate) {
    include($cacheFile);
    exit;
}
else
{
    ob_start();
?>
<html>
<head>
    <title>Our stuff</title>
</head>
<body>
<?
    echo date("d.m.Y",$maxDate);

    function AddTag($enumerator, $name) {
        $metaPrefix = '/<meta(?=[^>]*name="';
        $metaSuffix = '")\s[^>$]*content="([^"]*)[">]*$/ixu';
        $enumerator->AddPattern($name, $metaPrefix.$name.$metaSuffix);
    }

    $enumerator = new MetaEnumerator("..", false);
    AddTag($enumerator, "name");
    AddTag($enumerator, "country");
    AddTag($enumerator, "status");
    AddTag($enumerator, "active");
    $files = $enumerator->GetFiles();

    echo "<table>";
    echo "<tr>";
    echo "<th>Name</th>".
        "<th>Country</th>".
        "<th>Status</th>".
        "<th>Last update</th>";
    echo "</tr>";
    foreach($files as $file) {
        if ($file["name"] == null) continue;
        echo "<tr style=\"vertical-align: top;\">";
        echo "<td><a href=\"".$file["path"]."\" target=\"_blank\">".$file["name"]."</a></td>".
            "<td>".$file["country"]."</td>".
            "<td>".$file["eruption"]."</td>".
            "<td>".date("d.m.Y", $file["modified"])."</td>";
        echo "</tr>";
    }
    echo "</table>";
?>
</body>
</html>
<?
$fp = fopen($cacheFile, 'w');
fwrite($fp, ob_get_contents());
fclose($fp);
ob_end_flush();
}
?>
Lars-Erik
  • 1.4k
  • 9
  • 12