<?php
/*
RSS import for WordPress
by Andrew Shearer (awshearer@shearersoftware.com)

For current version and more info, see:
http://www.shearersoftware.com/software/web-tools/wordpress-rss-import/

This script is currently intended to be run from the command line or from the
web after it has been configured by editing variables in the first few lines
of the script.

To use it, first set the $path variable below to a path to an RSS file or
directory containing a blogBrowser archive (one folder per year, one RSS file
per month.) Then run this script from the command line (php import-rss.php).

Examples:

Import an rss.xml file in this directory:
$path = dirname(__FILE__).'/rss.xml';

Import an rss.xml file with a full path specified (Mac OS X full path):
$path = '/Users/testuser/Sites/mysite/rss.xml';

Import a blogBrowser archive in a folder named C:/documents/weblog,
including monthly RSS files such as weblog/2003/12.xml (Windows full path):
$path = 'C:/documents/weblog';

Future improvements: make this runnable from a web browser. Single RSS files
could be handled through uploads, and multi-file blogBrowser archives could be
specified by base URL.

Revision history:

2003-12-26  ashearer  Improved date conflict resolution with $kUpdatePostsAlways
                      and $kUpdatePostsIfNewer options; added $kTakeNoAction;
                      added more comments
2003-12-22  ashearer  Added blogBrowser archive support; optional mod. dates;
                      mod. date column autocreation
2003-12-21  ashearer  RSS import, initial version


*/

//$path = dirname(__FILE__).'/../archivedir';
//$path = dirname(__FILE__).'/rss.xml';
$path = 'http://www.example.com/rss.xml';

$kCreateModDateField = false;    // autocreate post_modified field?
$kSetModDateField = true;       // import post_modified field from RSS file?
$kUpdatePostsAlways = true;    // true to import RSS version even if it replaces current version
$kUpdatePostsIfNewer = false;    // if true, in case of conflict, use newer version; requires post_modified field
$kTakeNoAction = false;          // like -n flag; report actions but don't actually change DB

error_reporting(E_ALL);

class
post {
    var
$title;
    var
$content;
    var
$createDate;
    var
$modDate;
    var
$guid;
    var
$categories;
    var
$postTitle;
}

$kExcludeCategories = array('Testing' => '');

$currentPost = null;
$currentText = '';

function
parseDateISO8601($input) {
    
// returns the date in SQL (MySQL, at least)-compatible text format
    
return substr($input, 0, 10) . ' ' . substr($input, 11, 8);
}

function
parseDateRFC822($input) {
    
// returns the date in SQL (MySQL, at least)-compatible text format
    
return strftime('%Y-%m-%d %H:%I:%S', strtotime($input));
}

function
startElement($parser, $name, $attrs) {
    global
$currentPost, $currentText, $currentGuidAttrs;
    if (
$name == 'item') {
        
$currentPost = new post();
        
$currentPost->categories = array();
    }
    elseif (
$name == 'guid') {
        
$currentGuidAttrs = $attrs;
    }
    
$currentText = '';
}

function
endElement($parser, $name) {
    global
$currentPost, $currentText;
    
    switch (
$name) {
        case
'title': case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# title':
            
$currentPost->title = $currentText;
            break;
        
        case
'content:encoded': case 'http://purl.org/rss/1.0/modules/content/ encoded':
            
$currentPost->content = $currentText;
            break;
            
        case
'description': case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# description':
            
// content:encoded trumps description, so only save the description
            // if there's no content already
            
if (!isset($currentPost->content) || !strlen($currentPost->content)) {
                
$currentPost->content = $currentText;
            }
            break;
        
        case
'pubDate':
            
$currentPost->createDate = parseDateRFC822($currentText);
            break;
        
        case
'dc:date': case 'http://purl.org/dc/elements/1.1/ date':
            
$currentPost->createDate = parseDateISO8601($currentText);
            break;
        
        case
'dcterms:modified': case 'http://purl.org/dc/terms/ modified':
            
$currentPost->modDate = parseDateISO8601($currentText);
            break;
        
        case
'category': case 'dc:subject': case 'http://purl.org/dc/elements/1.1/ subject':
            
$currentPost->categories[] = $currentText;
            break;
        
        case
'guid':
            if (isset(
$currentGuidAttrs['isPermaLink']) && $currentGuidAttrs['isPermaLink'] == 'true') {
                
$currentPost->permalink = $currentText;
            }
            
$currentPost->guid = $currentText;
            break;
        
        case
'item': case 'http://www.w3.org/1999/02/22-rdf-syntax-ns# item':
            
processPost($currentPost);
            
$currentPost = null;
            break;
    }
    
    
$currentText = '';
}

function
characterData($parser, $data) {
    global
$currentText;
    
$currentText .= $data;
}



// WordPress-specific code


$post_author = 'admin';

require_once(
'../wp-config.php');
require_once(
ABSPATH.WPINC.'/template-functions.php');
require_once(
ABSPATH.WPINC.'/functions.php');
require_once(
ABSPATH.WPINC.'/vars.php');

if (
$kCreateModDateField && !$kTakeNoAction) {
    require_once(
ABSPATH.'/wp-admin/install-helper.php');
    
$res = '';
    
$tablename = $tableposts;
    
$ddl = "ALTER TABLE $tableposts ADD COLUMN post_modified datetime";
    
maybe_add_column($tablename, 'post_modified', $ddl);
    if (
check_column($tablename, 'post_modified', 'datetime')) {
        
$res .= $tablename . ' - ok <br />'."\n";
    } else {
        
$res .= 'There was a problem with ' . $tablename . '<br />'."\n";
        
//++$error_count;
    
}
    echo
$res;
}


function
processPost(&$post) {
    global
$kSetModDateField, $kUpdatePostsAlways, $kUpdatePostsIfNewer, $kTakeNoAction;
    
    
//print_r($post);
    
    // Filter out (ignore) posts having categories that are all listed as "excluded"
    // If a post has no categories, or at least one non-excluded category, it is still
    // included.
    
if (sizeof($post->categories)) {
        
$gotIncludedCategory = false;
        foreach (
$post->categories as $categoryName) {
            if (!isset(
$kExcludedCategories[$categoryName])) {
                
$gotIncludedCategory = true;
                break;
            }
        }
        if (!
$gotIncludedCategory) {
            return;
        }
    }
    
    global
$post_author, $kExcludeCategories;
    global
$wpdb;
    global
$tableusers, $tableposts, $tablepost2cat, $tablecategories;

    
$post_author_ID = $wpdb->get_var("SELECT ID FROM $tableusers WHERE user_login = '$post_author'");
    
    
$post_content = $post->content;
    
$post_content = str_replace('<br>', '<br />', $post_content); // XHTMLify <br> tags
    
    /* Un-word-wrap the content, because <br /> tags will be added at display time
    for line breaks, and RSS feeds are often already soft-wrapped. Replace \n and \r
    with spaces.
    
    However, we don't want to remove word wrapping inside <pre> tags. Stopping short
    of a full HTML parser, we only un-wrap those sections not inside <pre> tag pairs.
    (This code could be misled by things that look like <pre> tags wrapped in HTML comments,
    but oh well.)
    */
    /*$pos = $lastpos = 0;
    while ($lastpos !== false && ($pos = strpos($post_content, '<pre>', $lastpos)) !== false) {
        $post_content = substr($post_content, 0, $lastpos)
            . str_replace("\n", ' ', str_replace("\r", ' ', substr($post_content, $lastpos, $pos - $lastpos)))
            . substr($post_content, $pos);
        $lastpos = strpos($post_content, '</pre>', $pos);
    }
    if ($lastpos !== false) {
        $post_content = substr($post_content, 0, $lastpos)
            . str_replace("\n", ' ', str_replace("\r", ' ', substr($post_content, $lastpos)));
    }
    */
    
    
$post_content = addslashes($post_content);
    
    
#$post_content = str_replace("\r", ' ', $post_content);
    #$post_content = str_replace("\n", ' ', $post_content);
    
$post_date = addslashes($post->createDate);
    
$post_title = addslashes($post->title);
    
$post_modified = $kSetModDateField ? addslashes($post->modDate) : '';
    
$post_name = '';
    if (isset(
$post->permalink) && strlen($post->permalink)) {
        
$matches = array();
        if (
preg_match('|/[0-9]{4}/[0-9]{2}/[0-9]{2}/([A-Za-z0-9_-]*)/?|', $post->permalink, $matches)) {
            
$post_name = $matches[1];
            
$post_name = mysql_escape_string($post_name);
        }
    }
        
    
    
$categoryIDList = array();
    foreach (
$post->categories as $categoryName) {
        if (isset(
$kExcludedCategories[$categoryName])) {
            continue;
        }
        
$categoryID = $wpdb->get_var("SELECT cat_ID FROM $tablecategories WHERE cat_name = '".mysql_escape_string($categoryName)."'");
        if (!
$categoryID) {
            if (
$kTakeNoAction) {
                echo
"Would have inserted new category '$categoryName'.";
                
$categoryID = 0;
            }
            else {
                
$categoryNiceName = sanitize_title($categoryName);
                
$wpdb->query("INSERT INTO $tablecategories
                    (cat_name, category_nicename)
                  VALUES
                    ('"
.mysql_escape_string($categoryName)."','".mysql_escape_string($categoryNiceName)."')");
                
$categoryID = $wpdb->get_var("SELECT LAST_INSERT_ID()");
            }
        }
        else {
            
// category already exists; could update its nicename here if it tended not to be correct already.
            //$wpdb->query("UPDATE $tablecategories SET category_nicename = '".mysql_escape_string(sanitize_title($categoryName))."' WHERE cat_ID = ".intval($categoryID));
        
}
        
$categoryIDList[] = $categoryID;
    }
    
    
    
// Quick-n-dirty check for dups:
    
if ($kUpdatePostsIfNewer) {
        
$dupcheck = $wpdb->get_results("SELECT ID,post_date,post_title,post_modified FROM $tableposts WHERE post_date='$post_date' AND post_title='$post_title' LIMIT 1",ARRAY_A);
    }
    else {
        
$dupcheck = $wpdb->get_results("SELECT ID,post_date,post_title FROM $tableposts WHERE post_date='$post_date' AND post_title='$post_title' LIMIT 1",ARRAY_A);
    }
    if (
$dupcheck[0]['ID']) {
        
// post already exists
        
if ($kUpdatePostsAlways || ($kUpdatePostsIfNewer && $kSetModDateField && $dupcheck[0]['post_modified'] < $post_modified)) {
            print
"<br />\n\nUpdating post, ID = '" . $dupcheck[0]['ID'] . "'<br />\n";
            print
"Timestamp: " . $post_date . "<br />\n";
            print
"Post Title: '" . stripslashes($post_title) . "'<br />\n";
            if (!
$kTakeNoAction) {
                
$postID = $dupcheck[0]['ID'];
                
$result = $wpdb->query("
                UPDATE $tableposts
                    SET post_author = '$post_author_ID', post_date = '$post_date',
                    "
.($kSetModDateField ? "post_modified = '$post_modified', " : "")."
                    post_content='$post_content',
                    post_title = '$post_title', post_name = '$post_name' WHERE ID = "
.intval($postID));
                
//echo "DELETE FROM $tablepost2cat WHERE post_id = ".intval($postID);
                
$result = $wpdb->query("DELETE FROM $tablepost2cat WHERE post_id = ".intval($postID));
                foreach (
$categoryIDList as $categoryID) {
                    
$result = $wpdb->query("
                        INSERT INTO $tablepost2cat
                            (post_id, category_id)
                        VALUES
                            ("
.intval($postID).",".intval($categoryID).")
                        "
);
                }
            }
        }
        else {  
            print
"<br />\n\nSkipping duplicate post, ID = '" . $dupcheck[0]['ID'] . "'<br />\n";
            print
"Timestamp: " . $post_date . "<br />\n";
            print
"Post Title: '" . stripslashes($post_title) . "'<br />\n";
        }
    }
    else {
        print
"<br />\nInserting new post.<br />\n";
        print
"Timestamp: " . $post_date . "<br />\n";
        print
"Post Title: '" . stripslashes($post_title) . "'<br />\n";
        if (!
$kTakeNoAction) {
            
$result = $wpdb->query("
            INSERT INTO $tableposts
                (post_author,post_date,post_content,post_title,post_name,post_category"
.($post_modified ? ",post_modified" : "").")
            VALUES
                ('$post_author_ID','$post_date','$post_content','$post_title','$post_name','1'"
.($post_modified ? ",'$post_modified'" : "").")
            "
);
            
$postID = $wpdb->get_var("SELECT LAST_INSERT_ID();");
            if (
$postID) {
                foreach (
$categoryIDList as $categoryID) {
                    
$result = $wpdb->query("
                    INSERT INTO $tablepost2cat
                        (post_id, category_id)
                    VALUES
                        ("
.intval($postID).",".intval($categoryID).")
                    "
);
                }
            }
        }
    }
}

// XML parsing code
function importRSSFile($filePath) {
    if (
function_exists('xml_parser_create_ns')) {
        
$xml_parser = xml_parser_create_ns('iso-8859-1',' ');    // space sep for namespace URI
    
}
    else {
        
$xml_parser = xml_parser_create();
    }
    
// make sure to turn off case-folding; XML 1.0 is case-sensitive
    
xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, false);
    
xml_set_element_handler($xml_parser, "startElement", "endElement");
    
xml_set_character_data_handler($xml_parser, "characterData");
    if (!(
$fp = fopen($filePath, "r"))) {
        die(
"could not open XML input");
    }
    
    while (
$data = fread($fp, 4096)) {
        if (!
xml_parse($xml_parser, $data, feof($fp))) {
            die(
sprintf("XML error: %s at line %d",
                        
xml_error_string(xml_get_error_code($xml_parser)),
                        
xml_get_current_line_number($xml_parser)));
        }
    }
    
xml_parser_free($xml_parser);
    
fclose($fp);
}

function
importBlogArchive($dirPath) {
    
$startYear = 1980;
    
$endYear = intval(strftime('%Y'));
    for (
$testYear = $startYear; $testYear <= $endYear; $testYear++) {
        for (
$testMonth = 1; $testMonth <= 12; $testMonth++) {
            
$rssFilePath = $dirPath.'/'.$testYear.'/'.($testMonth < 10 ? '0' : '').$testMonth.'.xml';
            if (
is_file($rssFilePath)) {
                
importRSSFile($rssFilePath);
            }
        }
    }
}


if (
is_dir($path)) {
    
importBlogArchive($path);
}
else {
    
importRSSFile($path);
}

/*echo '<pre>';
print_r($EZSQL_ERROR);
echo '</pre>';
*/

?>