<?php

// This is a PHP script that handles offloading of bandwidth from a web
//  server. It's a sort of poor-man's Akamai. It doesn't need anything
//  terribly complex (Apache, PHP, a writable directory, and some small
//  modules).
//
// It works like this:
//  - You have a webserver with dynamic content, and static content that
//    may change arbitrarily (i.e. - various users making changes to their
//    homepages, etc). This server is under a lot of load, mostly from
//    the static content, which tends to be big. We call this the "base"
//    server.
//  - You have another webserver that you can use to offload some of the
//    bandwidth. We call this the "offload" server.
//  - You set up an Apache module (mod_offload) on the first server.
//    mod_offload inserts itself into the request chain, and decides if a
//    given file is safe static content (real file, not a script/cgi, no
//    password). In those cases, it sends a 302 redirect to the offload
//    server.
//  - The offload server gets a request from the redirected client. It then
//    sends an HTTP HEAD request for the file in question to the base server
//    while the client waits. It decides if it has the right file based on
//    the HEAD. If it does, it serves the cached file.
//  - If the file is out of date, or doesn't exist on the offload server, it
//    sends a regular HTTP request for it to the base server and
//    begins caching it. While caching it, it also feeds it to the client
//    that has been waiting.
//  - If another request comes in while the file is being cached, it will
//    stream what is already there from disk, and then continue to feed as
//    the rest shows up.


// !!! FIXME:  issues to work out.
//   - Could have a partial file cached if server crashes or power goes out.
//     Add a "cacher's process id" to the metadata, and have those feeding
//     from the cache decide if this process died...if so, wipe the entry and
//     recache it.
//   - Need to have a way to clean out old files. If x.zip is on the base,
//     gets cached, and then is deleted, it'll stay on the offload server
//     forever. Getting a 404 from the HEAD request will clean it out, but
//     the offload server needs to know to do that.


//
// Installation:
// You need PHP with --enable-sysvsem support. You should configure PHP to not
//  have a time limit on script execution. This script will not work on
//  Windows.
//
// You need some PEAR modules. As root, run:
//   pear install HTTP
//
//
// You need Apache to push every web request to this script, presumably in a
//  virtual host, if not the entire server.
//
// Assuming this script was at /www/scripts/index.php, you would want to add
//  this to Apache's config:
//
//   AliasMatch ^.*$ "/www/scripts/index.php"
//
//
// Then edit everything down to the "END OF CONFIG VALUES..." comment in this
//  file, and restart the server.
//
//  This file is written by Ryan C. Gordon (icculus@icculus.org).



// GDebug should be false at production time, but this lets you sanity check
//  some things before going live. While it is true, the script prints the
//  headers it would have sent (see do_header()) and reports how many bytes
//  it would have written instead of sending the file data itself.
$GDebug = false;

// This is the server that you are offloading (the "base" server). Give the
//  hostname only; the script prepends "http://" and appends the request URI
//  to build the URL it fetches.
$GBaseServer = 'icculus.org';

// Time in seconds that i/o to $GBaseServer should timeout in lieu of activity.
//  This is passed to the HEAD request made to the base server.
$GTimeout = 90;

// This is where we'll cache files. The script uses two subdirectories of
//  it: "files" for the cached content and "metadata" for the saved headers.
$GOffloadDir = '/usr/local/apache/offload';

// END OF CONFIG VALUES...



require_once 'PEAR.php';
require_once 'HTTP.php';

// Version string reported in the "Server:" response header.
$GVersion = '0.0.1';


// The URI the client asked for, normalized so it always starts with a slash;
//  the cache paths and the base server URL are built from it by plain
//  concatenation. substr() is used rather than the $str{0} offset syntax
//  (removed in PHP 8.0), and it also copes with an empty or missing
//  REQUEST_URI instead of raising an error.
$Guri = isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '';
if (substr($Guri, 0, 1) !== '/')
    $Guri = '/' . $Guri;

// Where saved headers and cached file contents live, and the URL of the
//  original file on the base server.
$GMetaDataDir = $GOffloadDir . '/metadata';
$GFilesDir = $GOffloadDir . '/files';
$GOrigUrl = "http://{$GBaseServer}{$Guri}";


// Emit one raw HTTP header line. While $GDebug is on, the header is printed
//  into the output as a "header('...');" line instead of being sent, so a
//  debug run shows which headers would have gone out.
function do_header($str)
{
    global $GDebug;

    if (!$GDebug)
    {
        header($str);
        return;
    } // if

    echo "header('" . $str . "');\n";
} // do_header


// Make sure directory $path exists, creating any missing parent directories
//  along the way. Every directory created gets permissions $mode (subject to
//  the process umask). Returns nothing; if the directory can't be created, a
//  warning is raised.
function recursiveMkdir($path, $mode)
{
    // An empty path can never be created, and dirname('') is '' again, so
    //  walking up the tree by hand would recurse forever on it. Anything that
    //  already exists is left alone.
    if (((string) $path === '') || file_exists($path))
        return;

    // Let mkdir() build the missing parents itself. Another request may be
    //  creating the same directories at the same moment; losing that race
    //  makes mkdir() fail even though the directory is now there, so only
    //  complain if the directory really doesn't exist afterwards.
    if (!@mkdir($path, $mode, true) && !is_dir($path))
        trigger_error("recursiveMkdir: couldn't create '$path'", E_USER_WARNING);
} // recursiveMkdir


// strpos() with an unambiguous "not found" result: returns the offset of
//  the first occurrence of $needle in $haystack, or -1 if it doesn't occur
//  (where strpos() would return false, which is easy to confuse with 0).
function sanestrpos($haystack, $needle)
{
    $offset = strpos($haystack, $needle);
    if ($offset === false)
        return -1;

    return $offset;
} // sanestrpos


// Read a metadata file into an associative array. The file is a sequence of
//  line pairs: a header name on one line and its value on the next. Names
//  and values are trimmed of surrounding whitespace.
//
// Returns an empty array if the file can't be read. A trailing name that
//  has no value line after it (a truncated file) is ignored rather than
//  read past the end of the data.
function loadMetadata($fname)
{
    $retval = array();
    $lines = @file($fname);
    if ($lines === false)
        return($retval);

    // Stop one short of the line count so $lines[$i+1] always exists.
    $max = count($lines) - 1;
    for ($i = 0; $i < $max; $i += 2)
        $retval[trim($lines[$i])] = trim($lines[$i+1]);
    return($retval);
} // loadMetadata


// Decide whether the cached copy described by $metadata is still current
//  according to $head (the headers from the base server). The cache counts
//  as current only when it has Content-Length, ETag and Last-Modified values
//  and each one is string-identical to the corresponding value in $head.
function cachedMetadataMostRecent($metadata, $head)
{
    $fields = array('Content-Length', 'ETag', 'Last-Modified');

    // All three values must have been recorded for the cached copy...
    foreach ($fields as $field)
    {
        if (!isset($metadata[$field]))
            return false;
    } // foreach

    // ...and none of them may differ from what the base server reports.
    foreach ($fields as $field)
    {
        if (strcmp($metadata[$field], $head[$field]) != 0)
            return false;
    } // foreach

    return true;
} // cachedMetadataMostRecent



// The mainline...

// Feed a fake robots.txt to keep webcrawlers out of the offload server.
//  The request path lives in $Guri; comparing against any other variable
//  would never match, and crawlers would be handed the base server's
//  robots.txt (or an error) instead.
if (strcmp($Guri, "/robots.txt") == 0)
{
    header('Content-type: text/plain');
    echo "User-agent: *\n";
    echo "Disallow: /\n";
    exit();
} // if


// In debug mode the whole response is a plain-text report: say what would be
//  fetched and check that the cache directories are in place.
//  ("{$var}" is used for interpolation; the "${var}" form is deprecated as
//  of PHP 8.2.)
if ($GDebug)
{
    header('Content-type: text/plain');
    echo "Offload Debug Run!\n";
    echo "You want to GET {$Guri} from $GBaseServer.\n";
    echo "  i.e. - $GOrigUrl\n";
    if (!is_dir($GMetaDataDir))
        echo "Metadata dir ('$GMetaDataDir') is missing or invalid!\n";
    if (!is_dir($GFilesDir))
        echo "Files dir ('$GFilesDir') is missing or invalid!\n";
    echo "\n\n";
} // if


// A query string means dynamic content, which this server never caches.
if (sanestrpos($Guri, '?') >= 0)
{
    do_header('HTTP/1.0 400 Bad Request');
    do_header('Connection: close');
    do_header('Content-type: text/plain');
    echo "HTTP/1.0 400 Bad Request\n";
    //print_r($result);
    echo "Cannot serve dynamic content from this offload server.\n";
    exit();
} // if

// $Guri comes straight from the client and is appended to the cache
//  directories to build the on-disk paths, so a ".." path segment could make
//  this script create, overwrite or delete files outside of the cache.
//  Well-behaved clients resolve ".." before sending a request, so refuse
//  any URI that still contains one.
if (preg_match('#(^|/)\.\.(/|$)#', $Guri))
{
    do_header('HTTP/1.0 400 Bad Request');
    do_header('Connection: close');
    do_header('Content-type: text/plain');
    echo "HTTP/1.0 400 Bad Request\n";
    echo "Cannot serve paths containing '..' from this offload server.\n";
    exit();
} // if

// Get a handle on the System V semaphore identified by this fixed key. The
//  key is the same for every request, so they all get the same semaphore;
//  it is used to serialize access to the cache. Without it we can't safely
//  touch the cache, so give up.
if (($semid = sem_get(0x8267bc62)) === false)  // !!! FIXME: good value?
{
    do_header('HTTP/1.0 503 Service Unavailable');
    do_header('Content-type: text/plain');
    echo "Couldn't allocate semaphore.\n";
    exit();
} // if


// Ask the base server about the file before touching the cache; the client
//  waits while this HEAD request runs. $GTimeout is passed along to it.
$head = HTTP::head($GOrigUrl, $GTimeout);

// Value for the Date response header. HTTP wants an HTTP-date expressed in
//  GMT (for example "Sun, 06 Nov 1994 08:49:37 GMT"); date('r') would give
//  local time with a numeric zone offset, which is not a valid HTTP-date.
$d = gmdate('D, d M Y H:i:s') . ' GMT';

// Couldn't talk to the base server at all.
if (PEAR::isError($head))
{
    do_header("HTTP/1.0 503 Service Unavailable");
    do_header("Date: $d");
    do_header('Content-type: text/plain');
    echo "Error: " . $head->getMessage();
    exit();
} // if

// Password-protected content must not be cached or served from here.
if (($head['response_code'] == 401) || (isset($head['WWW-Authenticate'])))
{
    do_header('HTTP/1.0 400 Bad Request');
    do_header("Date: $d");
    do_header('Connection: close');
    do_header('Content-type: text/plain');
    echo "HTTP/1.0 400 Bad Request\n";
    echo "Cannot serve passworded content from this offload server.\n";
    exit();
} // if

// Any other non-200 answer (redirect, not found, ...) is relayed to the
//  client as-is: same status line, plus the Location header if there was one.
else if ($head['response_code'] != 200)
{
    do_header($head['response']);
    do_header("Date: $d");
    if (isset($head['Location']))
        do_header('Location: ' . $head['Location']);
    do_header('Connection: close');
    do_header('Content-type: text/plain');
    echo $head['response'];

    //print_r($head);

    if ($head['response_code'] == 404)
    {
        // !!! FIXME: delete from cache, if it currently exists!
    } // if
    exit();
} // else if

// The cache is validated by comparing ETag, Content-Length and
//  Last-Modified, so a response lacking any of them can't be cached; it is
//  treated as dynamic content.
else if ( (!isset($head['ETag'])) ||
          (!isset($head['Content-Length'])) ||
          (!isset($head['Last-Modified'])) )
{
    do_header('HTTP/1.0 400 Bad Request');
    do_header('Connection: close');
    do_header('Content-type: text/plain');
    echo "HTTP/1.0 400 Bad Request\n";
    //print_r($head);
    echo "Cannot serve dynamic content from this offload server.\n";
    exit();
} // if

// If the base server didn't say what kind of file this is, call it generic
//  binary data.
if (!isset($head['Content-Type']))
{
    $head['Content-Type'] = 'application/octet-stream';
} // if

// !!! FIXME: Check Cache-Control, Pragma no-cache

if ($GDebug)
{
    echo "\nThe HTTP HEAD from " . $GBaseServer . "...\n";
    print_r($head);
    echo "\n\n";
} // if

// The request path is mirrored under both cache trees: one holds the file
//  contents, the other the saved headers. Make sure the parent directories
//  exist before anything tries to open files in them.
$filepath = "{$GFilesDir}{$Guri}";
$metadatapath = "{$GMetaDataDir}{$Guri}";
recursiveMkdir(dirname($filepath), 0755);
recursiveMkdir(dirname($metadatapath), 0755);

// Everything from here to the matching sem_release() runs under the
//  semaphore: checking the saved metadata and, if needed, replacing the
//  cache entry. Every early exit below releases the semaphore first.
sem_acquire($semid);

$cacheio = NULL;  // will be non-NULL if we're WRITING to the cache...
$frombaseserver = false;
$io = NULL;  // read from this. May be file or HTTP connection.
$metadata = loadMetadata($metadatapath);

// Case 1: the saved metadata matches the HEAD results, so serve the cached
//  file from disk.
if (cachedMetadataMostRecent($metadata, $head))
{
    $io = @fopen($filepath, 'rb');
    if ($io === false)
    {
        do_header('HTTP/1.0 503 Service Unavailable');
        do_header("Date: $d");
        do_header('Content-type: text/plain');
        echo "Couldn't access cached data.\n";
        sem_release($semid);
        exit();
    } // if

    if ($GDebug)
        echo "File is cached.\n";
} // if

// Case 2: nothing cached, or the cached copy is out of date. Open the
//  original on the base server and set up a fresh cache entry to fill
//  while feeding the client.
else
{
    ignore_user_abort(true);  // if we're caching, we MUST run to completion!

    $frombaseserver = true;
    $io = @fopen($GOrigUrl, 'rb');
    if ($io === false)
    {
        do_header('HTTP/1.0 503 Service Unavailable');
        do_header("Date: $d");
        do_header('Content-type: text/plain');
        echo "Couldn't retrieve data for caching; please try again later.\n";
        sem_release($semid);
        exit();
    } // if

    // $filepath may be a partial download that is still streaming from the
    //  base server...we count on Unix semantics here; we just unlink the
    //  file and the process that is still streaming the old one will keep
    //  writing to the old inode, oblivious to the new version. When it
    //  finishes streaming from the base server and to the client, it'll
    //  close the file, making the old inode vanish; future accesses to
    //  $filepath will get the latest version.
    @unlink($filepath);
    @unlink($metadatapath);

    // Start the new cache file. On failure, close the connection to the
    //  base server before bailing out.
    $cacheio = @fopen($filepath, 'wb');
    if ($cacheio === false)
    {
        do_header('HTTP/1.0 503 Service Unavailable');
        do_header("Date: $d");
        do_header('Content-type: text/plain');
        echo "Couldn't update cached data.\n";
        fclose($io);
        sem_release($semid);
        exit();
    } // if

    // Start the new metadata file. On failure, also remove the cache file
    //  just created, so no data file is left behind without metadata.
    $metaout = @fopen($metadatapath, 'wb');
    if ($metaout === false)
    {
        do_header('HTTP/1.0 503 Service Unavailable');
        do_header("Date: $d");
        do_header('Content-type: text/plain');
        echo "Couldn't update metadata.\n";
        fclose($cacheio);
        @unlink($filepath);
        fclose($io);
        sem_release($semid);
        exit();
    } // if

    // !!! FIXME: This is a race condition...may change between HEAD
    // !!! FIXME:  request and actual HTTP grab. We should really
    // !!! FIXME:  just use this for comparison once, and if we are
    // !!! FIXME:  recaching, throw this out and use the headers from the
    // !!! FIXME:  actual HTTP grab when really updating the metadata.
    //
    // !!! FIXME: Also, write to temp file and rename in case of write failure!

    // Save every entry of $head as a name line followed by a value line;
    //  this is the format loadMetadata() reads back.
    foreach ($head as $key => $val)
        fputs($metaout, $key . "\n" . $val . "\n");
    fclose($metaout);
    $metadata = $head;

    if ($GDebug)
        echo "Cache needs refresh...pulling from base server...\n";
} // else

// The cache entry is now in a consistent state (either current, or freshly
//  created and about to be filled), so other requests may look at it.
sem_release($semid);

// !!! FIXME: partial content:
// client...
//  Range: bytes=347776-
// server...
//  HTTP/1.1 206 Partial Content
//  Accept-Ranges: bytes
//  Content-Length: 239225299
//  Content-Range: bytes 347776-239573074/239573075

// Send the response headers. Everything describing the file comes from
//  $metadata: the saved metadata when serving from the cache, or the HEAD
//  results when (re)caching.
do_header('HTTP/1.1 200 OK');
do_header("Date: $d");
do_header("Server: offload.php version $GVersion");
do_header('Connection: close');
do_header('ETag: ' . $metadata['ETag']);
do_header('Last-Modified: ' . $metadata['Last-Modified']);
do_header('Content-Length: ' . $metadata['Content-Length']);
do_header('Content-Type: ' . $metadata['Content-Type']);

$max = $metadata['Content-Length'];  // total bytes we expect to send.
$br = 0;                 // bytes read from $io so far.
$cachebroken = false;    // set if a write to the cache file fails.
$lastsize = -1;          // size of the cached file at the previous check.
$lastgrowth = time();    // when the cached file was last seen to change size.

// Copy the file to the client in 8k chunks. If we are the process filling
//  the cache ($cacheio is set), every chunk is also appended to the cache
//  file.
while (true)
{
    if ($frombaseserver == false)
    {
        // Reading from the cache file, which may still be growing.
        $stat = @fstat($io);
        if ($stat === false)
            break;

        $cursize = $stat['size'];
        if ($cursize != $lastsize)
        {
            $lastsize = $cursize;
            $lastgrowth = time();
        } // if

        if ($cursize < $max)
        {
            if (($cursize - $br) <= 8192)  // may be caching on another process.
            {
                // The file is incomplete and we've caught up with it. If it
                //  hasn't grown for $GTimeout seconds, assume the process
                //  that was filling it is gone and stop, instead of
                //  sleeping here forever. The short transfer is then
                //  detected below and the entry is wiped so that a later
                //  request recaches it.
                if ((time() - $lastgrowth) > $GTimeout)
                    break;

                sleep(1);
                continue;
            } // if
        } // if
    } // if

    $data = fread($io, 8192);
    if ($data === false)  // read error: stop now rather than spin.
        break;

    if (isset($cacheio) && !$cachebroken)
    {
        // A failed or short write (disk full, etc) leaves the cache file
        //  incomplete. Remember that, stop writing to it, but keep feeding
        //  the client; the entry is thrown away once we're done.
        if (fwrite($cacheio, $data) !== strlen($data))
            $cachebroken = true;
        else
            fflush($cacheio);
    } // if

    $br += strlen($data);

    if (!connection_aborted())
    {
        if ($GDebug)
            echo "Would have written " . strlen($data) . " bytes.\n";
        else
            echo $data;
    } // if

    if ($br >= $max)
        break;

    if (feof($io))
        break;
} // while

// If we didn't move exactly the advertised number of bytes, or the cache
//  file couldn't be written completely, the cache entry can't be trusted:
//  remove it (under the semaphore) so the next request starts over.
if (($br != $max) || $cachebroken)
{
    if ($GDebug)
    {
        if ($br != $max)
            echo "Bogus transfer! Sent $br, wanted to send $max!\n";
        if ($cachebroken)
            echo "Failed to write cache file; discarding cache entry.\n";
    } // if
    sem_acquire($semid);
    @unlink($metadatapath);
    @unlink($filepath);
    sem_release($semid);
} // if

exit();

?>