#!/usr/local/bin/php

# Copy data from vqWiki to MoinMoin wiki
#  Jeff Olson <jeff@olsonzoo.com> - October 20, 2005   
#  Based on "mediawiki2moin.php" from http://moinmoin.wikiwikiweb.de/MediaWikiConverter
#  
#  Version 1: 10/20/2005
#  Version 2: 1/20/2006
#
#  - Copies all current and historical versions
#  - Copies attachments
#  - Tries to convert all wiki markup
#

<?php

# Set these variables before running the script.
# - $inputDir is the location of the vqWiki data files
# - $outputDir is the location where the MoinMoin "pages" directory is located...
#   WARNING! Any existing pages in $outputDir will be deleted if they exist in $inputDir
# - $ip is the IP address to record in each page's edit-log
# - $address is the DNS name of the server to record in each page's edit-log
#
# $ip and $address must be quoted strings: a bare 0.0.0.0 is a PHP parse error and a
# bare host name is read as undefined constants joined with the "." operator.

$inputDir = '/home/wiki/wiki';
$outputDir = '/codeswiki/data';
$ip = '0.0.0.0';
$address = 'something.example.com';

echo "*****\n\nReading Existing Files\n*****\n";

# This might run a while, so set time limit high (1 hour?)
set_time_limit(3600);

# Read input data from vqWiki: every "*.txt" entry in $inputDir is one page, and the
# page title is the file name without its ".txt" extension.
# $title is initialised up front so the later count($title) calls are valid even when
# no page file is found.
$title = array();
$a = 0;
$handle = opendir($inputDir);
if ($handle === false)
{
	# Nothing can be converted without the input directory, so stop here with a message.
	die("Unable to open input directory: $inputDir\n");
}

echo "Directory handle: $handle\n";
echo "Files:\n";

/* This is the correct way to loop over the directory. */
while (false !== ($file = readdir($handle))) 
{
	if (fnmatch("*.txt", $file) 
		#&& fnmatch("S*.txt", $file)  # uncomment if testing on specific files
	   ) 
	{
		# Strip only the trailing extension; an unanchored pattern would also remove
		# a ".txt" that appears in the middle of the name.
		$title[$a] = preg_replace("/\.txt$/", "", $file);
		$a++;
	}
}

closedir($handle);

# Get all revisions.
# For each page title, collect the names of its historical versions, which live in
# "$inputDir/versions" and are named "<title>.txt.<something>".
# $revisionTitle[$i] is the list of version file names for $title[$i]; glob() returns
# its matches sorted alphabetically by default.
$versionsDir = "$inputDir/versions";

# Stop if the versions directory is unusable: continuing would glob in the wrong
# directory and every page would later be recreated with zero revisions.
chdir($versionsDir) or die("Unable to change to versions directory: $versionsDir\n");

$revisionTitle = array();
$titleCount = isset($title) ? count($title) : 0;
for ($i = 0; $i < $titleCount; $i++)
{
	echo "$title[$i]............................";

	# glob() returns false on error; treat that as "no versions" so count() stays valid
	$foundRevisions = glob("$title[$i].txt.*");
	$revisionTitle[$i] = ($foundRevisions === false) ? array() : $foundRevisions;

	echo count($revisionTitle[$i]) . " versions found\n";
}


# Recursively delete a file or directory tree without going through a shell.
# - $path: file, symlink or directory to remove; a path that does not exist is ignored
# Returns true when nothing is left at $path, false if a delete failed.
function remove_tree($path)
{
	# symlinks are removed, never followed
	if (is_link($path) || is_file($path))
	{
		return unlink($path);
	}
	if (!is_dir($path))
	{
		return true;
	}

	$entries = scandir($path);
	if ($entries !== false)
	{
		foreach ($entries as $entry)
		{
			if ($entry !== "." && $entry !== "..")
			{
				remove_tree($path . "/" . $entry);
			}
		}
	}
	return rmdir($path);
}

# Go to output directory for MoinMoin wiki
echo "\n\n*****\nCreating New Files\n*****\n";
chdir($outputDir) or die("Unable to change to output directory: $outputDir\n");
chdir("pages") or die("Unable to change to pages directory under: $outputDir\n");

# For every page: recreate its directory, then write
#  - "current"    : number of the latest revision, zero-padded to 8 digits
#  - "edit-log"   : one tab-separated line per revision
#  - "revisions/" : one file per revision, named 00000001, 00000002, ...
$count = count($title);
for ($i = 0; $i < $count; $i++) 
{
	echo "\n$i: $title[$i]\n";
	
	# Fix title
	$title[$i] = fix_title($title[$i]);
	echo "\tfixed: $title[$i]\n";
	
	# Delete existing directory for specific page.
	# Done in PHP rather than with "rm -rf" in a shell string, so quotes, "$" or
	# backticks in a title cannot change the command that is run.
	remove_tree($title[$i]);

	# Remake directory for specific page and change to it
	mkdir($title[$i]) or die("Unable to create page directory: $title[$i]\n");
	chdir($title[$i]) or die("Unable to change to page directory: $title[$i]\n");
	
	# Write out current revision number to "current" file
	#  - pad to 8 digits
	$numberOfRevisions = count($revisionTitle[$i]);
	$lastRevisionNumber = sprintf("%08d", $numberOfRevisions);
	$file = fopen("current", "w") or die("Unable to write 'current' file for: $title[$i]\n");
	fputs($file, $lastRevisionNumber);
	fclose($file);

	# Create an "edit-log" file and keep it open to write out our info to it
	$editLogFile = fopen("edit-log", "w") or die("Unable to write 'edit-log' file for: $title[$i]\n");

	# Create "revisions" directory & change to it
	mkdir("revisions") or die("Unable to create revisions directory for: $title[$i]\n");
	chdir("revisions") or die("Unable to change to revisions directory for: $title[$i]\n");
	
	echo "\tVersions: ";
	
	# Go through each version of the page
	for ($j = 0; $j < $numberOfRevisions; $j++)
	{
		echo ($j + 1) . "...";
		
		# Get file contents; file_get_contents() opens and closes the file itself, so
		# no read handle is left open for every revision.
		$fullPath = $versionsDir . "/" . $revisionTitle[$i][$j];
		$file_text = file_get_contents($fullPath);
		if ($file_text === false)
		{
			# keep going with an empty revision rather than aborting the whole run
			echo "\n\tWARNING: unable to read $fullPath - writing empty revision\n";
			$file_text = "";
		}
		$file_text = explode("\n", $file_text);
		
		# Copy text from old to new, fixing syntax as we go 
		# - also pass in title for copying attachments and input & output directory to copy them
		$file_text = change_syntax($file_text, $title[$i], $inputDir, $outputDir);
		
		# Open file for writing -- each file is an eight-digit zero-padded number, starting with 1 (so use $j+1)
		$revisionNumber = sprintf("%08d", $j+1);
		$file = fopen($revisionNumber, "w") or die("\nUnable to write revision $revisionNumber for: $title[$i]\n");
		
		# Create output file: one converted line per row, trailing whitespace removed
		foreach ($file_text as $line)
		{
			fputs($file, rtrim($line) . "\n");
		}
		unset($file_text);
		
		# Close file
		fclose($file);
		
		# Get modification time from filename (format is filename.yyyy.mm.dd.hh.MM.ss)
		if (preg_match("/(\d{4})\.(\d{2})\.(\d{2})\.(\d{2})\.(\d{2})\.(\d{2})/", $revisionTitle[$i][$j], $modTimeString))
		{
			$year = (int) $modTimeString[1];
			$month = (int) $modTimeString[2];
			$day = (int) $modTimeString[3];
			$hour = (int) $modTimeString[4];
			$minute = (int) $modTimeString[5];
			$second = (int) $modTimeString[6];
			
			$modTime = mktime($hour, $minute, $second, $month, $day, $year);
		}
		else
		{
			# No timestamp in the name: fall back to the source file's own modification
			# time (or "now") instead of feeding undefined values to mktime().
			$modTime = filemtime($fullPath);
			if ($modTime === false)
			{
				$modTime = time();
			}
		}
		
		# touch file (named "revisionNumber") to correct modification time
		touch($revisionNumber, $modTime);
		
		# Write entry in edit-log file: the timestamp is the Unix time followed by six
		# zeros; the first revision is logged as SAVENEW, later ones as SAVE
		$time = $modTime . "000000";
		$label = ($j == 0) ? "SAVENEW" : "SAVE";
		$entry = "$time\t$revisionNumber\t$label\t$title[$i]\t$ip\t$address\n";
		fputs($editLogFile, $entry);
	}
	echo "\n";
	
	# Close edit-log file
	fclose($editLogFile);
	
	# Back up to specific page directory
	chdir("..") or die;

	# chown & chmod to set correct permissions (this assumes we're running script as root)
	system("chown -R apache:apache .");
	system("chmod -R g+w .");
	system("chmod -R o-rx .");
	
	# Back up to "pages" directory 
	chdir("..") or die;
}

# Back up to Moin directory
chdir("..") or die;

# Turn a vqWiki page title into the name used for the MoinMoin page directory.
# - $title: page title as read from the vqWiki file name
# Returns the title with every space and "+" replaced by "_", converted from
# ISO-8859-1 to UTF-8.
#
# The conversion is applied exactly once: running utf8_encode() over its own output
# encodes the already-encoded bytes again and corrupts every non-ASCII character.
# NOTE(review): this assumes the file names are ISO-8859-1 - confirm for your data.
# NOTE(review): utf8_encode() is deprecated as of PHP 8.2.
function fix_title($title)
{
	$title = str_replace(array(" ", "+"), "_", $title);
	return utf8_encode($title);
}

# Convert the lines of one vqWiki page revision to MoinMoin markup.
# - $array: the page text, one array element per line
# - $pageTitle: name of the page directory under "$outputDir/pages"; attachments are
#   copied into its "attachments" subdirectory
# - $inputDir: vqWiki data directory; attachments are read from "$inputDir/upload/jsp"
# - $outputDir: MoinMoin data directory
# Returns the converted lines. The number of elements is unchanged, but a converted
# element may contain an embedded newline (after "{{{#!java" / "{{{#!html" and for
# line breaks inside list items).
#
# Side effects: for every "attach:<name>" link found, copies the attachment file into
# the page's attachments directory (creating that directory if needed) and echoes a
# progress line.
#
# Lines inside a preformatted/code block are passed through untouched; the order of
# the substitutions on ordinary lines matters and is noted on each step.
function change_syntax ($array, $pageTitle, $inputDir, $outputDir) 
{
	# block state: which kind of multiple-line block (if any) we are currently inside
	$in_preformatted_text = 0;
	$in_multiple_line_code = 0;	
	$in_multiple_line_java_code = 0;
	$in_multiple_line_html_code = 0;

	# patterns
	$java_start_tag_pattern = "/\[<java>\]/";
	$java_end_tag_pattern = "/\[<\/java>\]/";
	$html_start_tag_pattern = "/\[<html>\]/";
	$html_end_tag_pattern = "/\[<\/html>\]/";
	$emptyRowPattern = "/^\s*$/";
	$attachmentPattern = "/attach:([\w.-]+)/"; # this is not a complete filename regex, but works for me!!!
	
	# the number of rows never changes inside the loop, so count once
	$rowCount = count($array);
	
	for ($a = 0; $a < $rowCount; $a++) 
	{
		# assign row as a reference to current array item, so edits land in $array
		$row =& $array[$a];
		
		# Handle multiple-line preformatted text: it ends at the first blank line
		if ($in_preformatted_text) 
		{
			if (preg_match("/^\s*$/", $row)) 
			{
				$row = "}}}";
				$in_preformatted_text = 0;
			}
			# otherwise leave the line untouched
			continue;
		}

		# Handle multiple-line code: it ends on the line containing }}}
		if ($in_multiple_line_code)
		{
			if (preg_match("/}}}/", $row)) 
			{
				$in_multiple_line_code = 0;
			}
			continue;
		}

		# Handle multiple-line java code: it ends on the line containing [</java>]
		if ($in_multiple_line_java_code)
		{
			if (preg_match($java_end_tag_pattern, $row)) 
			{
				$row  = preg_replace($java_end_tag_pattern, "}}}", $row);
				$in_multiple_line_java_code = 0;
			}
			continue;
		}

		# Handle multiple-line html code: it ends on the line containing [</html>]
		if ($in_multiple_line_html_code)
		{
			if (preg_match($html_end_tag_pattern, $row)) 
			{
				$row  = preg_replace($html_end_tag_pattern, "}}}", $row);
				$in_multiple_line_html_code = 0;
			}
			continue;
		}

		# From here on: not in multiple-line preformatted text or multiple-line code block

		# Preformatted text - @@@@ on line by self, but ending on another line where it's all blank
		if (preg_match("/^@@@@\s*$/", $row)) 
		{
			$row  = preg_replace("/^\s*@@@@\s*$/", "{{{", $row);	
			$in_preformatted_text = 1;	

			# Don't do any more processing on this line
			continue;
		}
		
		# Code - {{{ xxx }}} - may be on same or different lines
		if (preg_match("/{{{/", $row)) 
		{
			# if we don't find the closing braces, we are in a multiple-line code situation
			if (!preg_match("/}}}/", $row)) 
			{
				$in_multiple_line_code = 1;	
			}

			# Don't do any more processing on this line
			continue;
		}
		
		# Java Code - [<java>] xxx [</java>] - may be on same or different lines
		if (preg_match($java_start_tag_pattern, $row)) 
		{
			$row  = preg_replace($java_start_tag_pattern, "{{{#!java", $row);

			# if we don't find the closing tag, we are in a multiple-line java code situation
			if (!preg_match($java_end_tag_pattern, $row)) 
			{
				$in_multiple_line_java_code = 1;	
			}
			# otherwise, replace end tag
			else
			{
				$row  = preg_replace($java_end_tag_pattern, "}}}", $row);
			}

			# also add line break after open tag
			$row  = preg_replace("/{{{#!java/", "{{{#!java\n",  $row);

			# Don't do any more processing on this line
			continue;
		}
		
		# HTML Code - [<html>] xxx [</html>] - may be on same or different lines
		if (preg_match($html_start_tag_pattern, $row)) 
		{
			$row  = preg_replace($html_start_tag_pattern, "{{{#!html", $row);

			# if we don't find the closing tag, we are in a multiple-line html code situation
			if (!preg_match($html_end_tag_pattern, $row)) 
			{
				$in_multiple_line_html_code = 1;	
			}
			# otherwise, replace end tag
			else
			{
				$row  = preg_replace($html_end_tag_pattern, "}}}", $row);
			}

			# also add line break after open tag
			$row  = preg_replace("/{{{#!html/", "{{{#!html\n",  $row);

			# Don't do any more processing on this line
			continue;
		}

		# Tables
		$row  = preg_replace("/####/", "", $row );                  # don't need these
		$row  = preg_replace("/^([^#]+)##/", "||$1||", $row, 1);    # add 1st column start marker
		$row  = preg_replace("/##/", "||", $row );                  # all other markers
		
		# Backtick links: `link` => ["link"] - must come before 'No formatting code'
		$row  = preg_replace("/`([^`]+)`/", "[\"$1\"]", $row);     	
		
		# C2 links
		$row  = preg_replace("/c2:/", "wiki:Wiki:", $row);     	
		
		# No formatting code (__) - must come before underline conversion step
		$row  = preg_replace("/__([^_]+)__/", "`$1`", $row);    
		
		# Underline: ===text=== => __text__ (must come before headings)
		$row  = preg_replace("/===([^=]+)===/", "__$1__", $row);    # underline
		
		# Headings - longest marker first, so !!! is not consumed as !! or !
		$row  = preg_replace("/!!!([^!]+)!!!/", "= $1 =", $row);    # heading level 1
		$row  = preg_replace("/!!([^!]+)!!/", "== $1 ==", $row);    # heading level 2
		$row  = preg_replace("/!([^!]+)!/", "=== $1 ===", $row);    # heading level 3
		
		# Bulleted Lists: (there may be a better way to do this)
		$row  = preg_replace("/^\t\*/", " * ", $row);     			# bullet indented 1
		$row  = preg_replace("/^\t\t\*/", "   * ", $row);     			# bullet indented 2
		$row  = preg_replace("/^\t\t\t\*/", "     * ", $row);     			# bullet indented 3

		# Numbered Lists: # => 1.  (note: there may be a better way to do this)
		$row  = preg_replace("/^\t\#/", " 1. ", $row);     			# item indented 1
		$row  = preg_replace("/^\t\t\#/", "   1. ", $row);     			# item indented 2
		$row  = preg_replace("/^\t\t\t\#/", "     1. ", $row);     			# item indented 3
		
		# Line breaks inside lists: @@ becomes a newline followed by the same leading
		# whitespace as the list item itself
		if (preg_match("/^(\s*)(1\.|\*)(.*)@@/", $row, $matches)) 
		{
			$leadingSpaces = $matches[1];
			$row  = preg_replace("/@@/", "\n$leadingSpaces", $row);
		}
		
		# Other line breaks - appearing anywhere else
		$row  = preg_replace("/@@/", " [[BR]] ", $row);
		
		# Attachments: attach: -> attachment:   - Also copy attachments to new wiki
		# does not handle attachments in this format:  attach:"File name with spaces" -- fix those manually
		if (preg_match_all($attachmentPattern, $row, $attachmentMatches))
		{
			# Fix syntax (rewrites every attach: link on the line)
			$row  = preg_replace($attachmentPattern, "attachment:$1", $row);
			
			$newDirectory = "$outputDir/pages/$pageTitle/attachments"; 
			if (!is_dir($newDirectory))
			{
				mkdir($newDirectory);
			}
			
			# Copy every attachment named on the line, not just the first one, so that
			# all rewritten links point at a file that was actually copied
			foreach (array_unique($attachmentMatches[1]) as $attachmentFilename)
			{
				$existingLocation = "$inputDir/upload/jsp/$attachmentFilename";
				$newLocation = "$newDirectory/$attachmentFilename";
				
				echo "\tattachment...$attachmentFilename\n";
				
				# report a missing or uncopyable attachment instead of failing silently
				if (!is_file($existingLocation) || !copy($existingLocation, $newLocation))
				{
					echo "\tWARNING: could not copy attachment $existingLocation\n";
				}
			}
		}
		
		# Horizontal rules - no conversion necessary
		
		# Bold/italic - no conversion necessary

		# Handle line break issue: append [[BR]] when the next line continues this one.
		# Look at next line - only proceed if there are more lines
		if ($a+1 < $rowCount)
		{  
			$nextRow = $array[$a+1];
			
			# figure out if we should add a line break - only if all of these conditions are met
			if (!preg_match($emptyRowPattern, $row)               # current row is not empty
				&& !preg_match("/----/", $row)               	  # current row does not have horizontal rule
				&& !preg_match("/=+[^=]+=+/", $row)               # current row is not a heading
				&& !preg_match($emptyRowPattern, $nextRow)        # next row is not empty
				&& !preg_match("/^\t+[\*\#]/", $nextRow)          # next row doesn't start with bullet or numbered item
				&& !preg_match("/##/", $nextRow)           		  # next row doesn't contain table markup
			   )    
			{
				# only if all above conditions are met do we add a break
				$row .= " [[BR]]"; # include space before to prevent "Java:[[BR]] making an Interwiki link, among other things
			}
		}
	}
	
	# drop the reference so the last element of $array is no longer aliased by $row
	unset($row);
	
	return $array;
}


# Code to fix titles that I did not need
/*	$quoted = array();
	$in_parenthesis = false;
	for ($i = 0; $i < strlen($title[$a]); $i++) 
	{
		$curchar = substr ($title[$a], $i, 1);
		if (ereg('[^a-zA-Z0-9_]', $curchar)) 
		{
			if (!$in_parenthesis) 
			{
				$quoted[] = '(';
				$in_parenthesis = true;
			}
			$quoted[] = str_pad(dechex(ord($curchar)), 2, '0', STR_PAD_LEFT);
		} 
		else 
		{
			if ($in_parenthesis) 
			{
				$quoted[] = ')';
				$in_parenthesis = false;
			}
			$quoted[] = $curchar;
		}
	}
	if ($in_parenthesis)
	{
		$quoted[] = ')';
	}
	$title[$a] = implode('', $quoted);
	unset($quoted);
*/
?>