How-To: Extract PHP-Nuke Articles from an SQL INSERT Statement in Python

I found an old SQL dump of this r3dux.org site yesterday which I thought was lost to time. The site ran php-nuke back then, and rather than importing the SQL into a database and fiddling with it to export all the old articles I thought I’d take the python hammer to it. It’s a bit of a bodge/hack – but it seems to work well enough.

The idea is you copy JUST the insert statement which deals with “stories” (nuke’s word for articles / posts) from the SQL dump and put it in a file (mine’s called “nuke_stories.sql”), then run the parse_nuke_stories python script and it’ll dump out all the articles into separate files with filenames like “Article_ID_YEAR_MONTH_DAY.html” and such.

Here’s the top few lines of my SQL insert statement:

INSERT INTO `nuke_stories` (`sid`, `catid`, `aid`, `title`, `time`, `hometext`, `bodytext`, `comments`, `counter`, `topic`, `informant`, `notes`, `ihome`, `alanguage`, `acomm`, `haspoll`, `pollID`, `score`, `ratings`, `associated`) VALUES
(1, 0, 'r3duxAdmin', 'W00t! The site it be a-changin''...', '2004-09-10 12:35:06', 'Well, PHP-Nuke seems to be up and running, so now it''s time to mess w/ it in a huge way and configure all the millions of options needed to make it just the way I want.\r\n

\r\nDaunted? Me? Er, yeah. You could say that...', '', 0, 83, 1, 'r3duxAdmin', '', 1, '', 0, 0, 0, 0, 0, ''),

Here’s what it’ll dump out into a file called Article_001_2004_09_10.html if this was all that was there:

Article ID : 1
Author : r3duxAdmin
Title : W00t! The site it be a-changin’…
Date : 2004-09-10 12:35:06

Well, PHP-Nuke seems to be up and running, so now it’s time to mess w/ it in a huge way and configure all the millions of options needed to make it just the way I want.

Daunted? Me? Er, yeah. You could say that…

And here’s the actual python (version 3) code to parse the SQL file:

# Function to find a single apostrophe that closes the field and return its location
def find_end_apostrophe(location, story):
    while True:
        # Move along
        location += 1
 
        # Grab two chars
        char1 = story[location:location+1]
        char2 = story[location+1:location+2]
 
        # If they're both apostophes then skip ahead 2 chars
        if char1 == "'" and char2 == "'":
            location += 2
        # An apostrophe and something else? It's the end apostrophe so bail!
        elif char1 == "'" and char2 != "'":
            break
 
    return location
 
# Function to parse a story into separate fields and place them into the parsed_story list
def parse_story(story):
 
    parsed_story = []
 
    # Fields:
    # 0 = `sid`, 1 = `catid`, 2 = `aid`, 3 = `title`, 4 = `time`, 5 = `hometext`, `6 = bodytext`, 7 = `comments`,
    #`8 = `counter`, 9 = `topic`, 10 = `informant`, 11 = `notes`, 12 = `ihome`, 13 = `alanguage`, 14 = `acomm`,
    # 15 = `haspoll`, 16 = `pollID`, 17 = `score`, 18 = `ratings`, 19 = `associated`
 
    # 0 - Add story ID
    temp_end = story.find(",")
    field = story[1:temp_end]
    parsed_story.append(field)
    #print("ID:", field)
 
    # 1 - Add category ID
    temp_start = temp_end + 1
    temp_end = story.find(",", temp_start)
    field = story[temp_start:temp_end]
    parsed_story.append(field)
    #print("Category:", field)
 
    # 2 - Add author
    temp_start = temp_end + 3
    temp_end = story.find("'", temp_start)
    field = story[temp_start:temp_end]
    parsed_story.append(field)
    #print("Author:", field)
 
    # 3 - Add title
    temp_start = temp_end + 4
    temp_end   = find_end_apostrophe(temp_start, story)    
    #print("Title locs:", temp_start, temp_end)
    field = story[temp_start:temp_end].replace("''", "'")
    parsed_story.append(field)
    #print("Title:", field)
 
    # 4- Add time
    temp_start = temp_end + 4
    temp_end = story.find("'", temp_start)
    #print("Time locs:", temp_start, temp_end)
    field = story[temp_start:temp_end]
    parsed_story.append(field) # initial_story_split[4][2:-1])
    #print("Date:", field)
 
    # 5 - Add home text (i.e. the first paragraph)
    temp_start = temp_end + 4
    temp_end   = find_end_apostrophe(temp_start, story)
    #print("Home locs:", temp_start, temp_end)
    field = story[temp_start:temp_end].replace("''", "'").replace("<br><br>\\r\\n", "<br/><br/>").replace("\\r\\n", " ").replace("  ", "")
    parsed_story.append(field)
    #print("Home:", field)        
 
    # 6 = Add body text
    temp_start = temp_end + 3
    temp_end   = find_end_apostrophe(temp_start, story)
    #print("Body locs:", temp_start, temp_end)
    field = story[temp_start:temp_end].replace("''", "'").replace("<br><br>\\r\\n", "<br/><br/>").replace("\\r\\n", " ").replace("  ", "") # Last one is 2 'A with a hat' chars used for bullet points
 
    # If there is no body blank the field, otherwise strip the first char
    if field == "'":
        field = ""
    else:
        field = field[1:]        
 
    parsed_story.append(field)
    #print("Body:", field)
 
    # Not bothered about the rest of the fields
 
    return parsed_story
 
# ---------------------- Main --------------------------
 
#import codecs
#sql_file = codecs.open("nuke_stories.sql", encoding="utf-8")
 
# Read the SQL dump of stories into a list of lines where each line is a story
sql_file = open("nuke_stories.sql", "r")
story_list = sql_file.readlines();
sql_file.close()
 
num_stories = len(story_list)
print("Number of stories: ", num_stories, "\n")
 
# Newline char
nl = "<br/>"
 
outfile = 0
for story_number in range(1, num_stories - 1): # Start from 1 because line 0 is the insert statement itself
 
    # Parse the story (i.e. article)
    parsed_story = parse_story( story_list[story_number] )
 
    # Grab the first section of the date only. Format: YYYY-MM-DD
    date_str = parsed_story[4][:parsed_story[4].find(" ")].replace("-", "_")
 
    # Construct the filename then create the file for writing
    filename = "Article_" + parsed_story[0].zfill(3) + "_" + date_str + ".html"
    outfile = open(filename, "w")
 
    # Dump the parsed story details to file
    outfile.write("Article ID : " + parsed_story[0] + nl)
    # Skipping Category ID - don't care.
    outfile.write("Author     : " + parsed_story[2] + nl)
    outfile.write("Title      : " + parsed_story[3] + nl)
    outfile.write("Date       : " + parsed_story[4] + nl + nl)
    outfile.write(parsed_story[5] + nl + nl) # Home text (i.e. first paragraph)
    outfile.write(parsed_story[6])           # Body text (i.e. rest of article)
 
    # Close the file to save it
    outfile.close()
 
print("All done! =D")

As mentioned, it’s a complete hack – but if you need to do something similar it may work as a starter to get you going…

Downtime

r3dux.org was down for a day or two because I’d started but not enabled the DHCPCD service so enp0s3 wasn’t getting an IP on reboot – and then I forgot to remove a test ‘index.html’ in the server root.

While this was why it was down, the reason it stayed down is that I’m trying to finish my PhD at the moment while also working full time – so as you might imagine, this hobbyist website is one of the last fires to put out when everything is on fire.

Normal service (of old content) is now resumed.

On the upside, my new research paper “SoniFight: Software to provide additional sonification cues to video games for visually impaired players.” (DOI: 10.1007/s40869-018-0059-6) should be published soon in The Computer Games Journal special issue on accessibility.

If you’d like to know more then please feel free to hit up the SoniFight Github repo.

How To: Convert Day Number to Month and Day

I needed to convert the day number, that is, what day it is between 1 and 365, into a month and day without using Calendar / DateTime classes and such today, so I knocked up a few simple functions that work assuming it’s not a leap year (i.e. that February has 28 days).

Here they are if they’re of any use to you:

int dateToDayNumber(int month, int day)
{
	// Catch invalid input and return early
	if (month < 1 || month > 12 || day < 1 || day > 31) return 0;
 
	if (month == 1 ) return       day;
	if (month == 2 ) return 31  + day;
	if (month == 3 ) return 59  + day;
	if (month == 4 ) return 90  + day;
	if (month == 5 ) return 120 + day;
	if (month == 6 ) return 151 + day;
	if (month == 7 ) return 181 + day;
	if (month == 8 ) return 212 + day;
	if (month == 9 ) return 243 + day;
	if (month == 10) return 273 + day;
	if (month == 11) return 304 + day;
	return 334 + day;
}
 
int dayNumberToMonth(int dayNumber)
{
	// Catch invalid input and return early
	if (dayNumber < 1 || dayNumber > 365) return 0;
 
	if (dayNumber <= 31 ) return 1;  // Jan
	if (dayNumber <= 59 ) return 2;  // Feb
	if (dayNumber <= 90 ) return 3;  // Mar
	if (dayNumber <= 120) return 4;  // Apr
	if (dayNumber <= 151) return 5;  // May
	if (dayNumber <= 181) return 6;  // Jun
	if (dayNumber <= 212) return 7;  // Jul
	if (dayNumber <= 243) return 8;  // Aug
	if (dayNumber <= 273) return 9;  // Sep
	if (dayNumber <= 304) return 10; // Oct
	if (dayNumber <= 334) return 11; // Nov
	return 12;                       // Dec
}
 
int dayNumberToDayOfMonth(int dayNumber)
{
	// Catch invalid input and return early
	if (dayNumber < 1 || dayNumber > 365) return 0;
 
	if (dayNumber <= 31 ) return dayNumber;       // Jan
	if (dayNumber <= 59 ) return dayNumber - 31;  // Feb
	if (dayNumber <= 90 ) return dayNumber - 59;  // Mar
	if (dayNumber <= 120) return dayNumber - 90;  // Apr
	if (dayNumber <= 151) return dayNumber - 120; // May
	if (dayNumber <= 181) return dayNumber - 151; // Jun
	if (dayNumber <= 212) return dayNumber - 181; // Jul
	if (dayNumber <= 243) return dayNumber - 212; // Aug
	if (dayNumber <= 273) return dayNumber - 243; // Sep
	if (dayNumber <= 304) return dayNumber - 273; // Oct
	if (dayNumber <= 334) return dayNumber - 304; // Nov
	return dayNumber - 334;                       // Dec
}

A touch cheap, but it gets the job done.