#!/usr/bin/php
<?php
/*
	copyright <original author>
	TJ Fontaine <tjfontaine@gmail.com>

	This script allows the exporting of pages from MediaWiki to MoinMoin.

	Usage:
		Choose the options you want and fill in the appropriate
		variables. Make sure you at the very least edit the database
		settings.
		
		MM_USER_ID requires at least one user be registered in
		moinmoin, you can find this number in wiki/data/user/

		Once everything is set up, run the script, then copy
		$output_dir/* to wiki/data/pages/ and run
		mv wiki/data/pages/edit-log wiki/data/

		Your MediaWiki pages and history should now be available
		to you, check wiki/Main_Page

		By default the script exports namespaces 0-3. It has been
		my experience that namespace 0 are normal editable pages,
		1 is the normal pages Talk sections, 2 are the user pages,
		and 3 are the user page Talk sections. When filling in
		$IMPORT_PAGES if description is set it will export those
		pages to the description:
		
		(example)
			$IMPORT_PAGES['users-talk']['namespace'] = 3;
			$IMPORT_PAGES['users-talk']['description'] = "Talk";

		will cause all pages in that namespace to be exported to
		User/Talk, whereas

			$IMPORT_PAGES['users-talk']['namespace'] = 3;
			$IMPORT_PAGES['users-talk']['description'] = "TalkAgain";

		will cause all pages in that namespace to be exported to
		User/TalkAgain.

	Features:
		* Import Current Pages
		* Import By Namespace
		* Import Talk Pages (as Page/Talk)
		* Import Revision History
		* Import Images
		* Add "#format $parser" to header
		* Or make minimal changes to Wiki syntax

	Known Issues:
		* Changing the syntax on large sites will eat up memory;
		  that part of the code needs to be overhauled
		* Thumbnails aren't handled at all
		
	TODO:
		* Migrate Users
		* Map Users in revision history
		* Overhaul change_syntax
		* Image thumbnails

	ChangeLog:
		* 2006-01-12 TJ Fontaine <tjfontaine@gmail.com>
		  - Removed nasty not_pages array
		  - Import based on namespace
		  - Import Talk Pages
		  - Import images (uses find)
		  - Import Revision History
		  - Add Proper Revision Timestamp
		  - Add Conditional Revision Import

		* Version 0.3

		* 2007-11-07 David Huggins-Daines <dhuggins@cs.cmu.edu>
		  - Updated for Mediawiki 0.11
		  - Removed history for the time being (will put it back soon)
*/
########################
##  MediaWiki Options ##
########################

$MIGRATE_IMAGES = false;#set to true to copy images referenced by
			#[[Image:...]] links into each page's attachments
			#directory
$MW_IMAGE_PATH = "";	#full path to the mediawiki images directory; it is
			#searched with find(1) and only used when
			#$MIGRATE_IMAGES is true

########################
##  MoinMoin Options  ##
########################

$MM_USER_ID = "";	#moinmoin userid to identify the importer by; it is
			#written as the last field of every edit-log line
$ADD_MW_PARSER = true;	#true: write "#format $MW_PARSER" as the first line
			#of every page and leave the wiki text untouched.
			#false: the script runs its own minimal syntax
			#conversion instead; that older code needs to be
			#reworked and eats too much memory, so leave this
			#set to true
$MW_PARSER = "media";   #name of mediawiki parser in plugin/parser

########################
##  DB Settings       ##
########################

$MW_TABLE_PREFIX = "";	#prefix the mediawiki tables were installed with;
			#prepended to the page, text and revision table names
$host = "";		#mediawiki database server
$usr = "";		#mediawiki database username
$passwd = "";		#mediawiki database password
$db = "";		#mediawiki database name

########################
##  Pages To Import   ##
########################

# Each entry names one namespace to export. When 'description' is not empty
# every page of that namespace is written as "<title>(2f)<description>"
# ("(2f)" is the quoted form of "/"), otherwise under its plain title.
$IMPORT_PAGES['regular']['namespace'] = 0;
$IMPORT_PAGES['regular']['description'] = "";
$IMPORT_PAGES['regular-talk']['namespace'] = 1;
$IMPORT_PAGES['regular-talk']['description'] = "Talk";
$IMPORT_PAGES['users']['namespace'] = 2;
$IMPORT_PAGES['users']['description'] = "";
$IMPORT_PAGES['users-talk']['namespace'] = 3;
$IMPORT_PAGES['users-talk']['description'] = "Talk";

########################
##  Output Directory  ##
########################

$output_dir = "mediawiki_pages"; #where the script will output the exported
				 #pages. WARNING: if this path already exists
				 #it is deleted recursively and recreated on
				 #every run

/*
	DO NOT EDIT BELOW THIS LINE
	unless you think you know what you're doing
-----------------------------------------------------
*/

# Connect to the MediaWiki database; stop with the driver's message on failure.
$link = mysql_pconnect($host, $usr, $passwd);
if (!$link)
	die(mysql_error());
if (!mysql_select_db($db))
	die("Could not select database");

# Link delimiters used by the minimal syntax conversion.
$WIKI_LINK_START = "[";
$WIKI_LINK_END = "]";
$EXTERNAL_LINK_START = "[";
$EXTERNAL_LINK_END = "]";
$EXTERNAL_LINK_DIVIDER = " ";

# Always start from an empty output directory: wipe a previous export first.
if (file_exists($output_dir))
	rmdirr($output_dir);
mkdir($output_dir);

# Everything below writes relative to the output directory.
if (!chdir("./$output_dir"))
	die;

# Filled by append_edit_log() while the pages are exported.
$EDIT_LOG = array();

foreach ($IMPORT_PAGES as $pagetype) {
	migrate_current_pages($pagetype['namespace'], $pagetype['description']);
}

print "sorting Edit Log ...";
asort($EDIT_LOG);
print "Done\n";

# Write the merged, sorted log next to the exported page directories.
$edit_log = fopen("edit-log", "w");
fputs($edit_log, implode('', $EDIT_LOG));
fclose($edit_log);

chdir("..");
###End of Main

/*
	Export the latest revision of every page in one MediaWiki namespace.

	$page_namespace   - numeric namespace id used to filter the page table
	$page_description - optional suffix handed on to migrate_page_row()

	Dies with the MySQL error message if the query fails.
*/
function migrate_current_pages($page_namespace, $page_description = "")
{
	$prefix = $GLOBALS['MW_TABLE_PREFIX'];
	$pages = $prefix."page";
	$texts = $prefix."text";
	$revisions = $prefix."revision";

	# Join page -> latest revision -> text, skipping rows whose text is
	# exactly "MediaWiki default".
	$sql = "SELECT `$pages`.page_title as ptitle, " .
		"`$pages`.page_latest as revision, ".
		"`$texts`.old_text as text, `$pages`.page_touched as timestamp " .
		"FROM `$pages`, `$texts`, `$revisions` ".
		"WHERE `$revisions`.rev_id = `$pages`.page_latest ".
		"AND `$texts`.old_id = `$revisions`.rev_text_id ".
		"AND `$texts`.old_text NOT LIKE \"MediaWiki default\" " .
		"AND page_namespace = '$page_namespace' " .
		";";

	$result = mysql_query($sql);
	if (!$result)
		die(mysql_error());

	for ($row = mysql_fetch_object($result); $row; $row = mysql_fetch_object($result))
		migrate_page_row($row, $page_description);

	mysql_free_result($result);
}

/*
	Turn one result row into a MoinMoin page.

	$row  - object with ptitle, text, timestamp and revision properties
	$desc - when not empty the page is created as "<title>(2f)<desc>",
	        "(2f)" being the quoted form of "/"
*/
function migrate_page_row($row, $desc)
{
	$page_name = clean_title($row->ptitle);

	if (strlen($desc) > 0)
		$page_name .= "(2f)".$desc;

	create_page($page_name, $row->text, $row->timestamp, $row->revision);
}

/*
	Write one page into the current directory using the layout
	<page_title>/current and <page_title>/revisions/<page_revision>.

	$page_title     - directory name of the page (already quoted)
	$page_text      - raw wiki text of the revision
	$page_timestamp - timestamp passed on to the edit log
	$page_revision  - revision id; stored in "current" and used as the
	                  revision file name

	The working directory is changed while writing and restored before
	returning. The script dies if a directory cannot be created or entered.
*/
function create_page($page_title, $page_text, $page_timestamp, $page_revision)
{
	print 'create page '.$page_title.' revision '.$page_revision."\n";

	if (!is_dir($page_title) && !mkdir($page_title))
		die($page_title);
	if (!chdir($page_title))
		die($page_title);

	# From here on we are inside the page directory.
	append_edit_log($page_title, $page_timestamp, $page_revision);

	$current = fopen("current", "w");
	fputs($current, $page_revision);
	fclose($current);

	if ($GLOBALS['MIGRATE_IMAGES'])
		migrate_images($page_text);

	if (!is_dir("revisions") && !mkdir("revisions"))
		die("revisions");
	if (!chdir("revisions"))
		die("revisions");

	$out = fopen($page_revision, "w");
	$lines = explode("\n", $page_text);

	# Either tag the page for the MediaWiki parser or convert the syntax.
	if ($GLOBALS['ADD_MW_PARSER'])
		fputs($out, "#format ".$GLOBALS['MW_PARSER']." \n");
	else
		$lines = change_syntax($lines);

	# Trailing whitespace is stripped and every line ends with "\n".
	foreach ($lines as $line)
		fputs($out, rtrim($line) . "\n");

	fclose($out);

	# Climb back out: first "revisions", then the page directory.
	if (!chdir(".."))
		die(system('pwd'));
	if (!chdir(".."))
		die(system('pwd'));
}

/*
	Record one edit-log line for a page.

	The line is appended to the file "edit-log" in the current working
	directory and also collected in the global $EDIT_LOG array, which the
	main script sorts and writes out once all pages are exported.

	$page_title - quoted page name written into the log line
	$timestamp  - MediaWiki timestamp; when empty the current time is used
	$revision   - revision id; 0 is logged as SAVENEW, anything else as SAVE
*/
function append_edit_log($page_title, $timestamp, $revision)
{
	$file = fopen('edit-log', 'a+');

	if($revision == 0)
		$action = 'SAVENEW';
	else
		$action = 'SAVE';

	if(strlen($timestamp))
		$tstamp = getStamp($timestamp);
	else
		$tstamp = uts();

	# Tab separated: timestamp, revision, action, page, address, host, user.
	$el_string = "$tstamp\t$revision\t$action\t$page_title\t" .
		"127.0.0.1\tlocalhost\t".$GLOBALS['MM_USER_ID']."\n";

	fputs($file, $el_string);

	# Append instead of keying by timestamp: pages that share the same
	# timestamp would otherwise overwrite each other and disappear from the
	# merged log. Each line starts with its timestamp, so sorting the values
	# still yields chronological order.
	$GLOBALS['EDIT_LOG'][] = $el_string;

	fclose($file);
}

/*
	Current time as a digit string: the whole seconds reported by
	microtime() followed by the first six digits of its fractional part.
*/
function uts(){
	list($fraction, $seconds) = explode(" ", microtime());
	# $fraction looks like "0.12345600"; keep six digits after the dot.
	$digits = substr(strstr($fraction, "."), 1, 6);
	return ($seconds.$digits);
}

/*
	Convert a MediaWiki timestamp into the 16 digit stamp used in the
	edit log.

	$t - 14 digit string in the form YYYYMMDDHHMMSS

	Returns the Unix time in seconds (as computed by mktime() in the
	default timezone) right-padded with zeros to 16 characters.
	NOTE(review): the padding presumably turns seconds into microseconds
	for MoinMoin's edit-log - confirm against the MoinMoin version in use.
*/
function getStamp($t)
{
	# Field offsets inside YYYYMMDDHHMMSS. Minutes start at offset 10 and
	# seconds at offset 12; reading them one position later mixes digits of
	# neighbouring fields.
	$year = (int) substr($t, 0, 4);
	$month = (int) substr($t, 4, 2);
	$day = (int) substr($t, 6, 2);
	$hour = (int) substr($t, 8, 2);
	$min = (int) substr($t, 10, 2);
	$sec = (int) substr($t, 12, 2);

	$seconds = mktime($hour, $min, $sec, $month, $day, $year);

	return str_pad((string) $seconds, 16, '0', STR_PAD_RIGHT);
}

/*
	Copy every image referenced as [[Image:name]] or [[Image:name|...]]
	in $page_text into ./attachments (relative to the current directory).

	The image file is located by running find(1) below
	$GLOBALS['MW_IMAGE_PATH'] (the current directory when that is empty).
	Images that already exist in ./attachments or that find does not
	locate are skipped. The script dies if find fails or the copy fails.

	$page_text - raw wiki text of the page
*/
function migrate_images($page_text)
{
	$mw_path = $GLOBALS['MW_IMAGE_PATH'];
	$search_root = strlen($mw_path) ? $mw_path : '.';

	$image_matches = array();
	# Non-greedy, so two images on the same line are matched separately
	# instead of being swallowed into one bogus name.
	$image_pat = "/\[\[Image:(.*?)\]\]/";
	if(!preg_match_all($image_pat, $page_text, $image_matches))
		return;

	if(!is_dir("attachments"))
		mkdir("attachments");

	foreach($image_matches[1] as $image_ref)
	{
		# Everything after the first "|" is options or caption text.
		$image_file_name = strtok($image_ref, '|');
		if($image_file_name === false || !strlen($image_file_name))
			continue;
		if(file_exists('attachments/'.$image_file_name))
			continue;

		# Page text is user content: quote both arguments for the shell.
		$find_string = "find ".escapeshellarg($search_root).
			" -type f -name ".escapeshellarg($image_file_name);

		# exec() returns the last output line like system() does, but
		# without echoing find's output to stdout.
		$find_output = array();
		$ret = 0;
		$image_file_path = exec($find_string, $find_output, $ret);
		if($ret) die($image_file_path);

		if(strlen($image_file_path))
		{
			if(!copy($image_file_path, "./attachments/$image_file_name"))
				die("failed to copy $image_file_name\n");
			print " added attachment: $image_file_name \n";
		}
	}
}

/*
	Quote a MediaWiki title for use as a page directory name.

	Spaces become underscores and the title is passed through
	utf8_encode(). Every run of bytes outside [a-zA-Z0-9_] is then
	replaced by its lowercase hex codes wrapped in one pair of
	parentheses, e.g. "A/B" becomes "A(2f)B".

	NOTE(review): utf8_encode() assumes the title arrives as ISO-8859-1;
	if the database already returns UTF-8 the title would be encoded
	twice - confirm against the connection charset.

	$page_title - title as read from the database
	Returns the quoted title.
*/
function clean_title ($page_title)
{
	$page_title = utf8_encode(str_replace(" ", "_", $page_title));
	$quoted = '';
	$in_parenthesis = false;
	$length = strlen($page_title);

	for ($i = 0; $i < $length; $i++)
	{
		$curchar = $page_title[$i];

		# preg_match() replaces ereg(), which no longer exists in PHP 7+.
		if (preg_match('/[^a-zA-Z0-9_]/', $curchar))
		{
			# Open a hex group unless the previous byte already did.
			if (!$in_parenthesis)
			{
				$quoted .= '(';
				$in_parenthesis = true;
			}
			$quoted .= str_pad(dechex(ord($curchar)),
				2, '0', STR_PAD_LEFT);
		}
		else
		{
			if ($in_parenthesis)
			{
				$quoted .= ')';
				$in_parenthesis = false;
			}
			$quoted .= $curchar;
		}
	}

	# Close a hex group left open by a trailing special character.
	if ($in_parenthesis)
		$quoted .= ')';

	return $quoted;
}

/*
	Minimal MediaWiki -> MoinMoin conversion, applied line by line.

	$textString - array of page lines (as produced by explode("\n", ...))
	Returns the array with every line converted.

	Each line goes through these steps in order; the order matters because
	later checks see the output of earlier ones:
	  1. leading "*"       -> bullets()
	  2. leading "#"       -> numberedList()
	  3. "==...==" heading -> heading()
	  4. "[[...]]"         -> wikiLinks()
	  5. <br> / <br/>      -> [[BR]] (must come after the wiki links)
*/
function change_syntax ($textString) {
	foreach ($textString as $idx => $line) {
		if (substr($line, 0, 1) == '*')
			$line = bullets($line);

		if (preg_match("/^#/", $line))
			$line = numberedList($line);

		#headings
		if (preg_match("/^==.+==/", $line))
			$line = heading($line);

		#wikilink
		if (preg_match("/\[\[.+\]\]/", $line))
			$line = wikiLinks($line);

		#media wiki new line <br/> or <BR>
		if (preg_match("/\<br\/{0,1}\>/i", $line))
			$line = preg_replace("/\\<br\/{0,1}\>/i", "[[BR]]", $line);

		$textString[$idx] = $line;
	}

	return $textString;
}



#custom plugin
#function fileShare($string) {
#   $fileshare = substr($string, strpos($string, "\\\\"));
#   $fileshare = preg_replace("/<\/fileshare>/","",$fileshare);
#   $string = "[file:" .$fileshare ."]";
#   return $string;
#}

/*
	Convert a MediaWiki heading line to MoinMoin heading syntax.

	MoinMoin uses one "=" less per level, so MediaWiki "===" becomes "==".
	The heading text is everything between the leading "=" run and the
	next "=" character; a "=" inside the text therefore cuts it short.

	$string - heading line such as "== Title =="
	Returns "<markers> <text> <markers>".
*/
function heading($string){
	$rest = $string;

	# Peel "=" off the front, counting how many were removed. The bound
	# compares the count with the length of what is left, as the caller's
	# output depends on it for lines made only of "=" characters.
	for ($level = 0; $level < strlen($rest); $level++) {
		if (substr($rest, 0, 1) != '=')
			break;
		$rest = substr($rest, 1);
	}

	# Keep only the text in front of the closing "=" run.
	$title = substr($rest, 0, strpos($rest, '='));

	# One marker less than MediaWiki used.
	$markers = str_repeat("=", max(0, $level - 1));

	return $markers ." $title " .$markers;
}


/*
	Convert a MediaWiki bullet line ("*", "**", ...) to MoinMoin syntax:
	the run of asterisks is reduced to a single "*" and the line is
	indented with spaces according to the nesting depth.

	$string - line starting with one or more "*"
	Returns the indented line.
*/
function bullets ($string) {
	$depth = 0;

	# Each pass counts one level; while the second character is still a
	# "*", the first one is dropped so exactly one asterisk survives.
	while ($depth < strlen($string)) {
		$depth++;
		if (substr($string, 1, 1) != "*")
			break;
		$string = substr($string, 1);
	}

	return str_repeat(" ", $depth) . $string;
}

/*
	Convert a MediaWiki numbered-list line to MoinMoin syntax.

	The leading run of "#" is replaced by "1." indented with one space
	per "#": "# x" becomes " 1. x" and "## x" becomes "  1. x".
	Lines that do not start with "#" are returned unchanged.

	$string - one line of wiki text
	Returns the converted line.
*/
function numberedList ($string) {
	# Count the whole marker run first; testing for a single "#" first
	# would also match "##" and leave a stray "#" in the output.
	$depth = strspn($string, "#");
	if ($depth == 0)
		return $string;

	return str_repeat(" ", $depth) . "1." . substr($string, $depth);
}


/*
	Rewrite MediaWiki links in one line of text.

	  [[Target]]       -> <start>"Target"<end>
	  [[Target|Label]] -> <start>:Target: Label<end>

	<start> and <end> are the globals $WIKI_LINK_START and $WIKI_LINK_END.
	A "[[" without a following "]]" is left untouched.

	$string - one line of wiki text
	Returns the line with every complete link rewritten.
*/
function wikiLinks ($string) {
	global $WIKI_LINK_START;
	global $WIKI_LINK_END;

	$converted = "";
	$offset = 0;

	# Scan forward once. The closing "]]" is only searched for behind the
	# opening "[[" and already converted text is never scanned again, so a
	# stray "]]" earlier in the line cannot make the loop run forever.
	while (true) {
		$open = strpos($string, "[[", $offset);
		if ($open === false)
			break;

		$close = strpos($string, "]]", $open + 2);
		if ($close === false)
			break;

		#isolate link
		$link = (string) substr($string, $open + 2, $close - $open - 2);
		$dividerPosition = strpos($link, "|");

		if ($dividerPosition === false || $dividerPosition === 0) {
			# No label (a leading "|" leaves no target to link to):
			# quote the whole text as the page name.
			$link = $WIKI_LINK_START ."\"". $link ."\"" .$WIKI_LINK_END;
		}
		else {
			$wikilink = substr($link, 0, $dividerPosition);

			#remove whitespace from beginning and end
			$label = trim((string) substr($link, $dividerPosition + 1));

			$link = $WIKI_LINK_START .":" .$wikilink .": " .$label .$WIKI_LINK_END;
		}

		$converted .= substr($string, $offset, $open - $offset) . $link;
		$offset = $close + 2;
	}

	return $converted . substr($string, $offset);
}


/*
	Convert the label divider of MediaWiki external links.

	The link syntax is the same in both wikis except for the divider
	between URL and label: "| " (or a bare "|") is replaced by
	$EXTERNAL_LINK_DIVIDER, falling back to a single space when that
	global is not set.

	$string - one line of wiki text
	Returns the line with the dividers replaced.
*/
function externalLinks($string){
	global $EXTERNAL_LINK_DIVIDER;

	$divider = isset($EXTERNAL_LINK_DIVIDER) ? $EXTERNAL_LINK_DIVIDER : " ";

	# Plain string replacement: "|" is the alternation operator in a
	# regular expression and would match the empty string everywhere.
	if(strpos($string, "| ") !== false){
		$string = str_replace("| ", $divider, $string);
	}
	elseif(strpos($string, "|") !== false){
		$string = str_replace("|", $divider, $string);
	}

	return $string;
}

/*
	Recursively delete a directory.

	Entries are listed with glob("$dir/*"), so names starting with a dot
	are not matched; sub-directories are removed by recursion, everything
	else with unlink(), and finally the directory itself with rmdir().

	$dir - path of the directory to remove
*/
function rmdirr($dir) {
	$entries = glob($dir."/*");

	if ($entries) {
		foreach ($entries as $entry) {
			if (is_dir($entry))
				rmdirr($entry);
			else
				unlink($entry);
		}
	}

	rmdir($dir);
}

?>