Exporting Blog posts from WordPress to Sphinx

Updated: Jun 14, 2021

Taking Backup from WordPress

WordPress [1] provides a way to export [2] the entire blog as a single XML file, and the same page also lets you back up the multimedia data. I took both backups and kept them on my laptop. My entire blog is now in one XML file, so I need to parse that file and generate an rst file for each blog post. I looked for an existing tool that does this but could not find one, so I decided to write one.

xml2rst.py

This is the simplest parser I could write: it captures only the title, publication date and body of each blog post and creates an rst file from them. It does not capture comments or tags.

#!/usr/bin/env python3

import os
import re
import sys
import time
import calendar
import xml.parsers.expat

# Parser state shared by the expat handlers defined below.
# Flags are 0 or 1: `item` is 1 while inside an <item> element, and each
# capture* flag is 1 while the matching child element is open.
item = capturetitle = capturepubdate = capturedata = 0
# Text buffers for the current item's title, pubDate and content:encoded;
# they stay None until the corresponding start tag is seen.
ctitle = cpubdate = cdata = None

def start_element(name, attrs):
    """Expat start-tag handler: record which post field is being opened.

    Opening an ``item`` element switches item tracking on. While tracking is
    on, opening ``title``, ``pubDate`` or ``content:encoded`` raises the
    matching capture flag and resets that field's buffer to an empty string.

    ``attrs`` is part of the handler signature but is not used here.
    """
    global item, capturetitle, capturepubdate, capturedata
    global ctitle, cpubdate, cdata

    if name == "item":
        item = 1
        return
    if item != 1:
        # Elements outside an <item> are ignored.
        return

    if name == "title":
        capturetitle, ctitle = 1, ""
    elif name == "pubDate":
        capturepubdate, cpubdate = 1, ""
    elif name == "content:encoded":
        capturedata, cdata = 1, ""

def char_data(data):
    """Expat character-data handler: append ``data`` to each active buffer.

    A buffer is active while its capture flag equals 1. Text is appended
    rather than assigned, so successive calls for the same element
    accumulate into one string.
    """
    global ctitle, cpubdate, cdata

    if capturetitle == 1:
        ctitle = ctitle + data
    if capturepubdate == 1:
        cpubdate = cpubdate + data
    if capturedata == 1:
        cdata = cdata + data

def _write_article(title, pubdate, body):
    """Write one post as ``<year>/<epoch>_<slug>.rst`` and return that path.

    ``pubdate`` must match ``'%a, %d %b %Y %H:%M:%S %z'``; ``time.strptime``
    raises ``ValueError`` otherwise. The year directory is created relative
    to the current working directory, and the file's access/modification
    times are set to the publication time.
    """
    ptime = time.strptime(pubdate, '%a, %d %b %Y %H:%M:%S %z')
    # calendar.timegm() treats the fields as UTC and ignores the offset that
    # %z parsed, so subtract it to get the real instant. For +0000 dates the
    # result is unchanged. tm_gmtoff may be None on some platforms.
    ptime_epoch = calendar.timegm(ptime) - (getattr(ptime, 'tm_gmtoff', None) or 0)

    year_dir = str(ptime.tm_year)
    os.makedirs(year_dir, 0o755, exist_ok=True)

    # File-name slug: lower-case, spaces to underscores, everything else
    # outside [a-zA-Z0-9_] dropped.
    slug = re.sub(r'[^a-zA-Z0-9_]', '', title.lower().replace(' ', '_'))
    articlefile = year_dir + '/' + str(ptime_epoch) + '_' + slug + '.rst'

    rule = '*' * len(title)
    # Explicit UTF-8 so non-ASCII post text does not depend on the locale.
    with open(articlefile, 'w', encoding='utf-8') as article:
        article.write('.. title:: ' + title)
        article.write('\n\n%s\n' % rule)
        article.write(title)
        article.write('\n%s\n' % rule)
        article.write('\n| Updated: |modifieddate|\n\n')
        article.write(body)
    os.utime(articlefile, (ptime_epoch, ptime_epoch))
    return articlefile


def end_element(name):
    """Expat end-tag handler: stop capturing a field, or emit a finished post.

    Closing ``title``, ``pubDate`` or ``content:encoded`` inside an item
    lowers the matching capture flag. Closing ``item`` switches item tracking
    off and, if the item had a non-empty body, writes it out as an rst file.
    The three text buffers are then reset to ``None`` so an item that lacks
    one of the fields cannot reuse the previous item's text.

    Raises:
        ValueError: if the captured pubDate does not match the expected
            format (raised by ``time.strptime``).
    """
    global item, capturedata, capturetitle, capturepubdate
    global ctitle, cpubdate, cdata

    if item == 1:
        if name == "title":
            capturetitle = 0
        elif name == "pubDate":
            capturepubdate = 0
        elif name == "content:encoded":
            capturedata = 0

    if name != "item":
        return

    item = 0
    # Truthiness also covers cdata being None (no content:encoded seen).
    if cdata:
        _write_article(ctitle or '', cpubdate, cdata)
    ctitle = cpubdate = cdata = None


def main():
    """Parse the WordPress export named by ``sys.argv[1]``.

    The expat parser is wired to ``start_element``, ``end_element`` and
    ``char_data``. Exits with a usage message when no file name is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s <wordpress-export.xml>' % sys.argv[0])

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start_element
    parser.EndElementHandler = end_element
    parser.CharacterDataHandler = char_data
    # Binary mode lets expat decode using the encoding named in the XML
    # declaration instead of the locale default; the context manager closes
    # the file once parsing is done.
    with open(sys.argv[1], 'rb') as export:
        parser.ParseFile(export)


if __name__ == "__main__":
    main()

Manually fixing syntax highlighting

Even though the parser helped generate the rst files, I still had to fix the source code highlighting manually. This was boring, repetitive work, but instead of spending time improving the parser I decided to go through the generated rst files and fix them by hand, because I didn't have many blog posts to fix.

To be continued..


[1] https://wordpress.com

[2] https://wordpress.com/support/export/