ikiwiki to nikola: the script

People asked for it:

#! /usr/bin/python3

import argparse
from datetime import datetime
from glob import glob
from os import stat
from os.path import basename, splitext
import re
import sys
import time


# ikiwiki markup patterns used by the rewrite_* functions.

# a numeric footnote marker such as `[12]`
footnote_re = re.compile(r'\[(?P<foot_number>\d+)\]')
# a `[[!taglink name]]` directive
taglink_re = re.compile(r'\[\[!taglink (?P<tag_name>[^\]]*)\]\]')
# a `[[!img path]]` directive; the path is matched lazily so that the directive
# ends at its own `]]` instead of at the last `]]` found on the line
image_re = re.compile(r'\[\[!img (?P<path>.*?)\]\]')
# the opening line of a `[[!format language """` block
format_start_re = re.compile(r'^\[\[!format (?P<language>.*) """$')
# the closing line of a `[[!format ...` block
format_end_re = re.compile(r'^"""\]\]$')


def rewrite_footnotes_line(line, text_block, footnote_block, taglink_block, foot_number):
    '''Rewrite the `[N]` footnote markers of a single line.

    In the text block every `[N]` becomes `[^N]`; in the footnote block it
    becomes `[^N]:`.  A non-empty footnote-block line without any marker is
    stripped and indented by the width of the `[^N]: ` marker built from the
    `foot_number` passed in.

    Returns a `(new_line, foot_number)` tuple, where `foot_number` is the
    number of the last marker found on the line, or the value passed in when
    the line has none.

    Raises ValueError when a marker is found while neither `text_block` nor
    `footnote_block` is set.
    '''
    numbers = footnote_re.findall(line)

    if not numbers:
        if footnote_block and line:
            # '[^]: ' <-- 5 extra chars
            indent = ' ' * (len(foot_number) + 5)
            return f"{indent}{line.strip()}", foot_number

        return line, foot_number

    if not (text_block or footnote_block):
        raise ValueError('found a footnote in the taglink_block!')

    # references in the text have no colon, definitions in the footnote block do
    suffix = '' if text_block else ':'

    def to_markdown(match):
        return f"[^{match.group('foot_number')}]{suffix}"

    return footnote_re.sub(to_markdown, line), numbers[-1]


def rewrite_footnotes(src):
    '''Rewrite the footnote markers of a whole post.

    Lines starting with `---` split the post into sections.  With a single
    such line the post is text + taglinks; with two or more, the
    second-to-last one starts the footnote section (and is dropped) and the
    last one starts the taglink section (and is kept).  Lines longer than
    four characters that start with four spaces are copied untouched.

    Returns the rewritten post, always ending in a newline.
    '''
    lines = src.splitlines()
    hr_count = sum(1 for line in lines if line.startswith('---'))

    # the section being read: 'text', then 'footnote', then 'taglink'
    section = 'text'
    hr_seen = 0
    foot_number = ''
    output = []

    for line in lines:
        if len(line) > 4 and line.startswith('    '):
            # it's an inline code block, leave alone
            output.append(line)
            continue

        if line.startswith('---'):
            hr_seen += 1

            if section == 'text' and hr_count >= 2 and hr_seen == hr_count - 1:
                # the footnotes start here; don't keep the rule
                section = 'footnote'
                continue

            if hr_seen == hr_count:
                # the taglinks start here; the rule is needed later
                section = 'taglink'
                output.append(line)
                continue

        try:
            new_line, foot_number = rewrite_footnotes_line(
                line,
                section == 'text',
                section == 'footnote',
                section == 'taglink',
                foot_number,
            )
        except Exception as e:
            print(f"got `{e}´ for `{line}´.")
            raise

        output.append(new_line)

    return '\n'.join(output) + '\n'


def rewrite_taglinks(src):
    '''Strip the taglink lines from a post and collect their tag names.

    Lines made only of dashes are dropped.  Any line holding at least one
    `[[!taglink name]]` directive is dropped as a whole and its tag names
    are collected; every other line is kept.

    Returns a `(text, tags)` tuple; `text` always ends in a newline.
    '''
    kept = []
    collected = []

    for line in src.splitlines():
        if line and not line.strip('-'):
            # a rule made only of dashes: don't keep it
            continue

        found = taglink_re.findall(line)
        if found:
            collected += found
        else:
            kept.append(line)

    return '\n'.join(kept) + '\n', collected


def rewrite_images(src):
    '''Turn `[[!img path]]` directives into Markdown `![](/path)` images.

    Each line is processed on its own and whatever image_re matches on it is
    replaced; the text around the directive is kept.

    Returns the rewritten post, always ending in a newline.
    '''
    def to_markdown(match):
        # the root to which this 'absolute' path points is the website's root
        return f"![](/{match.group('path')})"

    converted = [image_re.sub(to_markdown, line) for line in src.splitlines()]

    return '\n'.join(converted) + '\n'


# format names that rewrite_format() replaces when it writes the opening fence;
# names not listed here are emitted unchanged
lang_map = {
    'py': 'python',
    'sh': 'bash',
}


def rewrite_format(src):
    '''Turn `[[!format lang """` ... `"""]]` blocks into fenced code blocks.

    The opening line becomes three backticks followed by the language (mapped
    through lang_map when there is an entry for it), the closing line becomes
    three backticks, and every other line is kept as is.

    Returns the rewritten post, always ending in a newline.
    '''
    fenced = []

    for line in src.splitlines():
        if (opening := format_start_re.match(line)) is not None:
            language = opening.group('language')
            # if there's no mapping use the same language name
            fenced.append(f"```{lang_map.get(language, language)}")
        elif format_end_re.match(line):
            fenced.append('```')
        else:
            fenced.append(line)

    return '\n'.join(fenced) + '\n'


def titlify(src):
    '''ikiwiki-to-nikola -> Ikiwiki to nikola

    Splits on dashes, title-cases the first piece only and joins everything
    back with single spaces.
    '''
    first, *rest = src.split('-')

    return ' '.join([first.title(), *rest])


def test_offesetify():
    '''-3600 seconds west of UTC (one hour east) must render as +0100.'''
    assert offsetify(-3600) == '+0100'


def offsetify(src):
    '''-3600 -> +0100

    Format an offset given in seconds *west* of UTC (the convention of
    time.timezone) as a `+HHMM` / `-HHMM` string.

    src: the offset as an integer number of seconds; positive means west of
         UTC, which is written with a `-` sign.

    Returns the formatted offset; leftover seconds below a whole minute are
    dropped.
    '''
    # NOTE: time.timezone returns seconds west of UTC, which is the opposite of how usual offsets go
    sign = '-' if src > 0 else '+'

    # split the magnitude, not the signed value: floor division of a negative
    # number would round the hours away from zero (-19800 -> 6 hours, not 5)
    hours, seconds = divmod(abs(src), 3600)
    # "offsets are always in minutes" sounds like one item in 'things developers believe about timezones'
    minutes = seconds // 60

    return f"{sign}{hours:02d}{minutes:02d}"


def datify(src):
    '''1701288755.377908 -> 2023-11-29 21:12:35 +0100

    Format a POSIX timestamp as local wall-clock time followed by the UTC
    offset that the system's local timezone had at that moment.

    src: seconds since the epoch, e.g. a file's st_mtime.

    Returns a `YYYY-MM-DD HH:MM:SS +HHMM` string.
    '''
    # astimezone() without arguments attaches the system's local timezone with
    # the offset in effect at that instant.  time.timezone is always the
    # non-DST offset, so it would mislabel every timestamp that falls in DST.
    dt = datetime.fromtimestamp(src).astimezone()

    return dt.strftime('%Y-%m-%d %H:%M:%S %z')


# zoneinfo for some reason doesn't know about CEST, so I'll just hack a mapping here
tzname_to_utc_offset = dict(
    CEST='+0200',
     CET='+0100',
)

# lower-case three-letter month abbreviations; where two names share a number
# (jan/ene, apr/abr, aug/ago, dec/dic) both spellings are accepted
month_name_to_number = dict(
    jan= 1,
    ene= 1,
    feb= 2,
    mar= 3,
    apr= 4,
    abr= 4,
    may= 5,
    jun= 6,
    jul= 7,
    aug= 8,
    ago= 8,
    sep= 9,
    oct=10,
    nov=11,
    dec=12,
    dic=12,
)


def dedatify(src):
    '''['Posted', 'Sun', '26', 'Aug', '2012', '11:27:16', 'PM', 'CEST'] -> 2012-08-26 23:27:16 +0200

    Convert the eight tokens of a "Posted ..." date line into a
    `YYYY-MM-DD HH:MM:SS +HHMM` string in 24-hour time.

    src: a sequence of eight strings laid out as in the example above.

    Raises KeyError when the month abbreviation or the timezone name is not
    in the lookup tables.
    '''
    #       0         1      2     3      4       5           6     7
    # src=['Posted', 'Sun', '26', 'Aug', '2012', '11:27:16', 'PM', 'CEST']
    month = month_name_to_number[src[3].lower()]
    utc_offset = tzname_to_utc_offset[src[7]]
    h, m, s = [ int(x) for x in src[5].split(':') ]

    # 12-hour clock: 12 AM is 00:xx and 12 PM is 12:xx, so fold 12 down to 0
    # before adding the afternoon half; any other marker leaves the hour alone
    meridiem = src[6].upper()
    if meridiem == 'PM':
        h = h % 12 + 12
    elif meridiem == 'AM':
        h %= 12

    return f"{src[4]}-{month:02d}-{int(src[2]):02d} {h:02d}:{m:02d}:{s:02d} {utc_offset}"


def build_meta(filepath, tags, date=None):
    '''Build the contents of the .meta file for a post.

    filepath: path of the source post; its base name without extension is
              used as the slug and, through titlify(), as the title.
    tags:     iterable of tag names, written comma-separated.
    date:     optional "Posted ..." tokens for dedatify(); when None the
              file's modification time is used instead.

    Returns a `(filename, meta)` tuple.
    '''
    filename, _ = splitext(basename(filepath))

    if date is not None:
        date_string = dedatify(date)
    else:
        date_string = datify(stat(filepath).st_mtime)

    return filename, f""".. title: {titlify(filename)}
.. slug: {filename}
.. date: {date_string}
.. tags: {', '.join(tags)}
.. type: text
"""


def import_post(opts):
    '''Convert one ikiwiki post into a nikola .md/.meta pair under posts/.

    opts: parsed command-line options; `opts.filepath` is the post to read
          and `opts.date` is either None or the tokens of a "Posted ..." line.

    Writes `posts/<name>.md` and `posts/<name>.meta`, where <name> is the
    source file's base name without extension.
    '''
    # the with-blocks make sure every file is closed (and the output flushed)
    # as soon as it has been used
    with open(opts.filepath) as source:
        src = source.read()

    mid, tags = rewrite_taglinks(rewrite_footnotes(src))
    dst = rewrite_format(rewrite_images(mid))
    # build_meta() itself falls back to the file's mtime when date is None
    filename, meta = build_meta(opts.filepath, tags, date=opts.date)

    with open(f"posts/{filename}.md", 'w') as body:
        body.write(dst)

    with open(f"posts/{filename}.meta", 'w') as sidecar:
        sidecar.write(meta)


def parse_args():
    '''Parse the command line: one FILE plus an optional 8-token -d/--date.

    Returns the argparse namespace, with `filepath` and `date` attributes;
    `date` is None when the option is not given.
    '''
    cli = argparse.ArgumentParser()

    cli.add_argument('filepath', metavar='FILE')
    cli.add_argument(
        '-d', '--date',
        nargs=8,
        help='Just pass something like "Posted Wed 12 Sep 2012 08:19:23 PM CEST".',
    )

    return cli.parse_args()


if __name__ == '__main__':
    # parse the command line and convert the requested post
    import_post(parse_args())

I removed all the tests, but they all looked like this:

def test_dedatify():
    '''An evening CEST date must come out in 24-hour time with a +0200 offset.'''
    tokens = 'Posted Wed 12 Sep 2012 08:19:23 PM CEST'.split()

    assert dedatify(tokens) == '2012-09-12 20:19:23 +0200'

Enjoy.