#!/usr/bin/perl
# Terrible, horrible, no-good very bad Perl script for parsing flat-file HTML
# files into a format that can be imported by Movable Type.
#
# SYNTAX: ./jotsheet_conversion.pl [inputfile] > [outputfile]
#
# This script really chokes if there are DIVs within the "contentbox"
# DIVs. I tried to address this but failed. :)
#   (This revision counts DIVs that open at the start of a line inside a
#   "contentbox" so that their closing tags are kept; DIVs that open in the
#   middle of a line are still not tracked.)
#
# I wrote it, blame me. --tom sherman [_bleach at yahoo dot com]
#
###
#
# This script deals with this kind of HTML from the old flat files. There
# are some variations on this structure, so the regex is a bit complex.
#
#   <div class="contentbox">
#   <h3><a name="..."></a><a href="...">01 jul 02 : title</a></h3>
#   [content]
#   </div>
#
# NOTE(review): the copy of this script that was reviewed had been run
# through an HTML-tag stripper, which removed every literal tag from the
# example above, from the patterns below and from the H4 replacement text.
# Everything marked "reconstructed" is a best guess based on the surviving
# fragments and on how the captures are used -- confirm each one against a
# real input file before trusting the output.

use strict;
use warnings;

# Primary entry heading [h3].
# Captures: $1 year, $2 month, $3 day, $4 date text, $5 title.
# Surviving original text: <\/a> ... ]+>([^:]+):?\s*([^<]*)<\/h3>
# NOTE(review): the <h3> tag, the named anchor carrying the three date
# captures and the start of the second tag are reconstructed; the date
# layout inside name="..." (YYYYMMDD with optional separators) is assumed.
my $date_and_title_h3_re =
    qr/<h3[^>]*><a\s+name="(\d{4})\D?(\d{2})\D?(\d{2})"[^>]*><\/a><a\s[^>]+>([^:]+):?\s*([^<]*)<\/h3>/;

# Secondary entry heading [h4].
# Captures: $1 permalink, $2 number word, $3 title.
# Surviving original text:
#   <\/a>\[? ... ]*>([^<]*)<\/a>\s?:?\s?([^\]]*)\]?\s*<\/h4>
# NOTE(review): the <h4> tag, the anchor carrying the permalink capture and
# the start of the second anchor are reconstructed.
my $date_and_title_h4_re =
    qr/<h4[^>]*><a\s+name="([^"]+)"[^>]*><\/a>\[?<a\s[^>]*>([^<]*)<\/a>\s?:?\s?([^\]]*)\]?\s*<\/h4>/;

# Line that follows the last entry; it is replaced by the Movable Type
# footer that closes the final entry.
# TODO(review): the original pattern was lost completely. This placeholder
# never matches, so the final entry is NOT closed until the real marker is
# filled in here.
my $end_of_entries_re = qr/(?!)/;

# End of the HTML document; the line is blanked.
my $end_of_document_re = qr/<\/body><\/html>/;

# NOTE(review): the three DIV patterns are reconstructed. Only the leading
# "^" anchors and the "contentbox" name (from the original comments)
# survived.
my $contentbox_open_re = qr/^<div class="contentbox">/;
my $div_open_re        = qr/^<div\b/;
my $div_close_re       = qr/^<\/div>/;

# Remove a trailing "</a>" that the heading patterns can leave on the text
# used as a fallback title. Returns the cleaned text.
sub strip_trailing_anchor_close {
    my ($text) = @_;
    $text =~ s/^([^>]+)<\/a>$/$1/;
    return $text;
}

# Build the Movable Type import text that closes the previous entry and
# opens a new one with the given title and date. Returns the text without a
# trailing newline (the input line's own newline is kept by the caller).
sub mt_entry_header {
    my ($title, $month, $day, $year) = @_;
    return "-----\nKEYWORDS:\n\n-----\n\n\n--------\n"
         . "AUTHOR: tom\n"
         . "TITLE: $title\n"
         . "STATUS: Publish\n"
         . "ALLOW COMMENTS: 0\n"
         . "CONVERT BREAKS: 0\n"
         . "ALLOW PINGS: 0\n"
         . "PRIMARY CATEGORY: Old Jotsheet Entries\n\n"
         . "DATE: $month/$day/$year 12:01:00 AM\n"
         . "-----\nBODY:";
}

my $input_file = $ARGV[0];
defined $input_file
    or die "SYNTAX: $0 [inputfile] > [outputfile]\n";

# Three-argument open with a lexical handle: the file name can no longer be
# misread as an open mode, and a missing file is reported instead of
# silently producing empty output.
open my $in, '<', $input_file
    or die "Cannot open '$input_file' for reading: $!\n";

# 0 while outside a "contentbox" DIV, 1 directly inside one, and greater
# than 1 while inside DIVs nested within it.
my $contentbox_depth = 0;

while (my $line = <$in>) {

    # Primary entry [h3], replace with MT syntax.
    if ($line =~ $date_and_title_h3_re) {
        my ($year, $month, $day, $date_text, $title) = ($1, $2, $3, $4, $5);

        # Headings without a separate title use the date text as the title.
        # A length test is used so that a title of "0" is not discarded.
        if (!length $title) {
            $title = strip_trailing_anchor_close($date_text);
        }

        my $header = mt_entry_header($title, $month, $day, $year);
        # ".*" stops at the newline, so the line keeps its own line ending.
        $line =~ s/.*/$header/;
    }

    # Secondary entry [h4], properly format in HTML.
    if ($line =~ $date_and_title_h4_re) {
        my ($permalink, $number_word, $title) = ($1, $2, $3);

        if (!length $title) {
            $title = strip_trailing_anchor_close($number_word);
        }

        # NOTE(review): only '$entryTitle</a></h4>' and one earlier '</a>'
        # survived from the original replacement; the tags are reconstructed.
        my $heading = qq{<a name="$permalink"></a>}
                    . qq{<h4><a href="#$permalink">$title</a></h4>};
        $line =~ s/.*/$heading/;
    }
    elsif ($line =~ $end_of_entries_re) {
        $line =~ s/.*/-----\nKEYWORDS:\n\n-----\n/;
    }
    elsif ($line =~ $end_of_document_re) {
        $line =~ s/.*//;
    }
    elsif ($line =~ $contentbox_open_re) {
        $line =~ s/.*//;
        $contentbox_depth = 1;
    }
    # We need to allow non-"contentbox" DIVs.
    elsif ($contentbox_depth > 0 && $line =~ $div_open_re) {
        # Keep the line; remember how many DIVs it leaves open so that the
        # matching closing tags are kept too. DIVs opened and closed on the
        # same line cancel out.
        my $opened = () = $line =~ /<div\b/g;
        my $closed = () = $line =~ /<\/div>/g;
        $contentbox_depth += $opened - $closed if $opened > $closed;
    }
    elsif ($line =~ $div_close_re) {
        if ($contentbox_depth > 1) {
            # Closes a nested DIV: keep the line.
            $contentbox_depth--;
        }
        else {
            # Closes the "contentbox" itself (or is a stray closing tag):
            # blank the line. The original tested "$mark = 1", an assignment
            # that is always true, so it blanked every closing DIV and never
            # reached its decrement branch.
            $line =~ s/.*//;
            $contentbox_depth = 0;
        }
    }

    print $line;
}

close $in;
exit;