Exporting Blog posts from WordPress to Sphinx¶
Updated: May 07, 2024
Taking Backup from WordPress¶
WordPress [1] provides a way to export [2] the entire blog as a single XML file; it also provides a way to back up multimedia data on the same page where you take the blog backup. I took both backups and kept them on my laptop. My entire blog is now in an XML file. I need to parse that file and generate an rst file for each blog post. I looked for an existing tool that does this, but could not find one. So, I decided to write one.
xml2rst.py¶
This is the simplest parser I could write: it captures only the title, pubdate and blogpost of each post and creates an rst file for it. It does not capture comments or tags.
#!/usr/bin/env python3
import os
import re
import sys
import time
import calendar
import xml.parsers.expat
# Parser state shared by the expat handler functions in this script.
# 1 while the parser is inside an <item> element, 0 otherwise.
item = 0
# Capture flags: 1 while inside the <title>, <pubDate> or <content:encoded>
# child of an <item>, respectively.
capturetitle = 0
capturepubdate = 0
capturedata = 0
# Text accumulated for the current post's title, publication date and body.
# Each is set to "" when its element opens.
ctitle = None
cpubdate = None
cdata = None
def start_element(name, attrs):
    """Expat start-tag handler: begin capturing the fields of a blog post.

    Opening an ``<item>`` marks the parser as being inside a post and
    clears the per-post accumulators.  Inside a post, opening ``title``,
    ``pubDate`` or ``content:encoded`` switches the matching capture flag
    on and empties its accumulator.  Every other element is ignored.

    Args:
        name: Name of the element being opened, as reported by expat.
        attrs: Attributes of the element (unused).
    """
    global item, capturetitle, capturepubdate, capturedata
    global ctitle, cpubdate, cdata

    if name == "item":
        item = 1
        # Start every post from a clean slate: without this, a post that
        # lacks one of the captured elements would inherit the previous
        # post's value (or the initial None).
        ctitle = ""
        cpubdate = ""
        cdata = ""
        return

    if item != 1:
        # Elements outside a post (e.g. the channel's own <title>) are
        # not captured.
        return

    if name == "title":
        capturetitle = 1
        ctitle = ""
    elif name == "pubDate":
        capturepubdate = 1
        cpubdate = ""
    elif name == "content:encoded":
        capturedata = 1
        cdata = ""
def char_data(data):
    """Expat character-data handler: collect text for the active field.

    Expat may deliver the text of one element in several chunks, so each
    chunk is appended to the accumulator whose capture flag is currently
    set to 1.

    Args:
        data: The chunk of character data reported by expat.
    """
    # Only the accumulators are rebound here; the capture flags are read.
    global ctitle, cpubdate, cdata

    if capturetitle == 1:
        ctitle = ctitle + data
    if capturepubdate == 1:
        cpubdate = cpubdate + data
    if capturedata == 1:
        cdata = cdata + data
def end_element(name):
    """Expat end-tag handler: stop capturing and write out a finished post.

    Closing ``title``, ``pubDate`` or ``content:encoded`` inside an
    ``<item>`` switches the matching capture flag off.  Closing the
    ``<item>`` itself writes the post to ``<year>/<epoch>_<slug>.rst``
    (relative to the current working directory) when the post has a
    non-empty body, and sets the file's access and modification times to
    the publication time.

    Args:
        name: Name of the element being closed, as reported by expat.

    Raises:
        ValueError: If the captured publication date does not match
            ``'%a, %d %b %Y %H:%M:%S %z'``.
    """
    global item, capturetitle, capturepubdate, capturedata
    global ctitle, cpubdate, cdata

    if name != "item":
        if item == 1:
            if name == "title":
                capturetitle = 0
            elif name == "pubDate":
                capturepubdate = 0
            elif name == "content:encoded":
                capturedata = 0
        return

    item = 0

    # Take the captured fields and clear the shared state right away so a
    # later post that lacks one of them cannot reuse this post's values.
    title = ctitle or ""
    pubdate = cpubdate or ""
    body = cdata
    ctitle = cpubdate = cdata = None

    if not body:
        # No body captured (None or empty): nothing to write.
        return

    ptime = time.strptime(pubdate.strip(), '%a, %d %b %Y %H:%M:%S %z')
    # calendar.timegm() treats the fields as UTC and ignores the offset
    # parsed by %z, so subtract it to get the real epoch.
    ptime_epoch = calendar.timegm(ptime) - (ptime.tm_gmtoff or 0)

    # The directory is named after the year as written in the pubDate.
    year = str(ptime.tm_year)
    os.makedirs(year, 0o755, exist_ok=True)

    slug = re.sub(r'[^a-zA-Z0-9_]', '', title.lower().replace(' ', '_'))
    articlefile = year + '/' + str(ptime_epoch) + '_' + slug + '.rst'

    rule = '*' * len(title)
    # Blog text is not limited to the locale's encoding; write UTF-8.
    with open(articlefile, 'w', encoding='utf-8') as article:
        article.write('.. title:: ' + title)
        article.write('\n\n%s\n' % rule)
        article.write(title)
        article.write('\n%s\n' % rule)
        article.write('\n| Updated: |modifieddate|\n\n')
        article.write(body)
    os.utime(articlefile, (ptime_epoch, ptime_epoch))
def main():
    """Parse the WordPress export file named by the first CLI argument.

    The expat handlers of this script are attached to a fresh parser and
    the whole file is fed through it; the handlers write the rst files as
    posts are completed.

    Raises:
        SystemExit: If no export file was given on the command line.
        xml.parsers.expat.ExpatError: If the file is not well-formed XML.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: xml2rst.py <wordpress-export.xml>')

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start_element
    parser.EndElementHandler = end_element
    parser.CharacterDataHandler = char_data

    # Feed raw bytes so expat decodes using the encoding declared in the
    # XML itself, and close the file when parsing is done.
    with open(sys.argv[1], 'rb') as export:
        parser.ParseFile(export)


if __name__ == "__main__":
    main()
Manually fixing syntax highlighting¶
Even though the parser helped generate the rst files, I still had to fix the source code highlighting manually. This was boring and repetitive work, but instead of spending time improving the parser, I decided to just go through the generated rst files and fix them by hand, because I didn't have a lot of blog posts to fix.
To be continued..