Concurrent Processing In Pelican

Pelican’s performance when rendering large blogs can be improved considerably by converting articles to HTML in parallel. Here I provide numbers and sample code for both Markdown and reStructuredText (reST).

To explore the timings we need a fairly large sample of text. On GitHub there is a repository of German law texts in Markdown:

git clone https://github.com/bundestag/gesetze.git

And James Gardner’s “The Pylons Book” provides us with reStructuredText:

wget -r -l99 -A'*.txt' http://pylonsbook.com/

Both corpora need some retouching before we can run Pelican’s readers on them. The law texts carry their metadata in fields, which is fine, but the fields are enclosed between '---' delimiter lines, and our Markdown processor does not accept those, so the script strips them while reading. The files of the Pylons Book contain Sphinx-specific directives and roles, which we must mask for our standalone reST processor.
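
For illustration, a law text carries its fields between two '---' lines, roughly like this (the field names here are only indicative):

---
Title: Some statute
slug: some-statute
---

Dropping the two '---' lines leaves plain 'Key: value' pairs, which the Markdown 'meta' extension parses as document metadata. The reST masking can be tried interactively:

>>> import re
>>> re.sub(r'(\.\.\s+\w+\s*::)', r'   "\1"', '.. index:: single: Pylons')
'   ".. index::" single: Pylons'
>>> re.sub(r':\w+:`[^`]+`', '', 'see :ref:`chapter-1` for details')
'see  for details'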

The code samples below process some 30 million characters, first sequentially on a single processor and then with eight workers in parallel. The timings were taken with Python 3.2.
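
The heart of both scripts, listed in full below, is simply swapping the builtin map for ProcessPoolExecutor.map from concurrent.futures. A minimal sketch of the pattern (render and filenames are placeholders for any picklable function and its inputs):

from concurrent import futures

def render(filename):
    ...  # read one file, return the rendered result

filenames = [...]  # the inputs

with futures.ProcessPoolExecutor(max_workers=8) as executor:
    for result in executor.map(render, filenames):
        ...  # results arrive in input order, just as with the builtin map

Because executor.map preserves input order, the consuming loop stays identical to the sequential version. (From Python 3.5 on, executor.map also accepts a chunksize argument that can reduce inter-process overhead for many small tasks.)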

Markdown:

35223891  chars, 1000  titles
35223891  chars, 1000  titles
71.9049699306488    (seconds, sequential)
17.336802005767822  (seconds, parallel, 8 workers)

reST:

33071060  chars, 460  titles
33071060  chars, 460  titles
73.695631980896     (seconds, sequential)
28.3037269115448    (seconds, parallel, 8 workers)
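
On eight workers (four physical cores with hyper-threading, see the machine data at the end) that is a speedup of roughly 4.1x for Markdown (71.9 s / 17.3 s) and 2.6x for reST (73.7 s / 28.3 s).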

try_parallel_markdown.py:

#!/usr/bin/env python

import os
from concurrent import futures
import timeit
from markdown import Markdown


INDIR = '../../../gesetze'
FILENAMES = []
# Collect every Markdown file from the cloned gesetze repository.
for root, dirs, files in os.walk(INDIR):
    for f in files:
        if f.startswith('.') or not f.endswith('.md'):
            continue
        FILENAMES.append(os.path.join(os.path.abspath(root), f))
FILENAMES.sort()
MAX = 1000  # len(FILENAMES) - 1

def render(f):
    # Read the file, dropping the two '---' lines that enclose the
    # metadata fields; the 'meta' extension then parses the fields.
    a = []
    head = 0
    with open(f, 'r', encoding='utf-8') as fh:
        for s in fh:
            if head < 2:
                if s.startswith('---'):
                    head += 1
                    continue
            a.append(s)
    md = Markdown(extensions=['codehilite', 'extra', 'meta'])
    content = md.convert("".join(a))
    return (content, md.Meta)


def render_all():
    # Sequential baseline: render with the builtin map in a single process.
    chars = 0
    titles = []
    for content, metadata in map(render, FILENAMES[:MAX]):
        chars += len(content)
        try:
            titles.append(metadata['title'])
        except KeyError:
            titles.append('~NO TITLE~')
    print(chars, ' chars,', len(titles), ' titles')

def render_all_parallel():
    # Same loop, but executor.map fans render() out to 8 worker processes.
    chars = 0
    titles = []
    with futures.ProcessPoolExecutor(max_workers=8) as executor:
        for content, metadata in executor.map(render, FILENAMES[:MAX]):
            chars += len(content)
            try:
                titles.append(metadata['title'])
            except KeyError:
                titles.append('~NO TITLE~')
    print(chars, ' chars,', len(titles), ' titles')


t = timeit.timeit('render_all()', number=1, setup="from __main__ import render_all")
tp = timeit.timeit('render_all_parallel()', number=1, setup="from __main__ import render_all_parallel")
print(t)
print(tp)
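
A portability note: the script creates its worker pool from module-level code, which works here because Linux forks the workers. On platforms that spawn a fresh interpreter per worker (e.g. Windows), the timing calls must sit behind the usual guard, along these lines:

if __name__ == '__main__':
    t = timeit.timeit('render_all()', number=1,
                      setup="from __main__ import render_all")
    tp = timeit.timeit('render_all_parallel()', number=1,
                       setup="from __main__ import render_all_parallel")
    print(t)
    print(tp)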

try_parallel_reST.py:

#!/usr/bin/env python

import os
from concurrent import futures
import timeit
import re
from pelican.readers import RstReader


INDIR = "pylonsbook.com/en/1.1/_sources"
OUTDIR = INDIR
FILENAMES = []
_TMP_FILENAMES = []
for root, dirs, files in os.walk(INDIR):
    for f in files:
        if f.startswith('.') or not f.endswith('.txt'):
            continue
        _TMP_FILENAMES.append(os.path.join(os.path.abspath(root), f))
_TMP_FILENAMES.sort()

# Rewrite each .txt source as .rst, masking Sphinx-specific markup that a
# standalone reST processor cannot handle.
for a in _TMP_FILENAMES:
    b = a[:-3] + 'rst'
    FILENAMES.append(b)
    with open(a, 'r', encoding='utf-8') as ha:
        with open(b, 'w', encoding='utf-8') as hb:
            for sa in ha:
                sb = re.sub(r'(\.\.\s+\w+\s*::)', r'   "\1"', sa)  # mask directives like '.. index::'
                sb = re.sub(r':\w+:`[^`]+`', '', sb)               # mask roles like ':ref:`bar`'
                hb.write(sb)

MAX = len(FILENAMES) - 1
N = 20  # the book is smaller than the law corpus, so render it 20 times


def render(f):
    # Each worker constructs its own RstReader; Pelican's reader returns
    # the rendered content and the metadata directly.
    r = RstReader({})
    content, metadata = r.read(f)
    return (content, metadata)


def render_all():
    # Sequential baseline, repeated N times over the corpus.
    chars = 0
    titles = []
    for i in range(N):
        for content, metadata in map(render, FILENAMES[:MAX]):
            chars += len(content)
            try:
                titles.append(metadata['title'])
            except KeyError:
                titles.append('~NO TITLE~')
    print(chars, ' chars,', len(titles), ' titles')

def render_all_parallel():
    # Parallel version. Note that a fresh pool is created on each of the
    # N passes, so pool start-up costs are included in the timing.
    chars = 0
    titles = []
    for i in range(N):
        with futures.ProcessPoolExecutor(max_workers=8) as executor:
            for content, metadata in executor.map(render, FILENAMES[:MAX]):
                chars += len(content)
                try:
                    titles.append(metadata['title'])
                except KeyError:
                    titles.append('~NO TITLE~')
    print(chars, ' chars,', len(titles), ' titles')


t = timeit.timeit('render_all()', number=1, setup="from __main__ import render_all")
tp = timeit.timeit('render_all_parallel()', number=1, setup="from __main__ import render_all_parallel")
print(t)
print(tp)
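
The reST script pays for starting a fresh worker pool on every one of the 20 passes, which probably accounts for part of its smaller speedup. A variant that reuses one pool across all passes could look like this (a sketch only; it changes what is measured, so the numbers above would not carry over):

def render_all_parallel():
    chars = 0
    titles = []
    # One pool for all N passes instead of one pool per pass.
    with futures.ProcessPoolExecutor(max_workers=8) as executor:
        for i in range(N):
            for content, metadata in executor.map(render, FILENAMES[:MAX]):
                chars += len(content)
                titles.append(metadata.get('title', '~NO TITLE~'))
    print(chars, ' chars,', len(titles), ' titles')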

Tests were performed on this machine:

Linux Morrigan 3.2.0-2-amd64 #1 SMP Mon Jun 11 17:24:18 UTC 2012 x86_64 GNU/Linux

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
On-line CPU(s) list:   0-7
Thread(s) per core:    2
Core(s) per socket:    4
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 26
Stepping:              5
CPU MHz:               1600.000
BogoMIPS:              6414.16
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              8192K
NUMA node0 CPU(s):     0-7