Pelican’s performance in rendering large blogs can be accelerated by transforming articles to HTML in parallel. For Markdown and reStructuredText (reST) I provide numbers and sample code here.
To explore the timings we need rather extensive samples of text. On GitHub we find a repository of German law texts as Markdown:
git clone https://github.com/bundestag/gesetze.git
And James Gardner’s “The Pylons Book” provides us with reStructuredText:
wget -r -l99 -A'*.txt' http://pylonsbook.com/
Both corpora need some retouching before we can run Pelican’s readers on them. The law texts contain some field markup, which is fine, but the fields are enclosed in `---` lines. Our Markdown processor does not like these. And the files of the Pylons Book contain special directives which we must mask for our standalone reST processor.
The code samples process some 30M characters, first sequentially on a single processor, and then with 8 workers in parallel. See the numbers (Python 3.2).
Markdown:
35223891 chars, 1000 titles 35223891 chars, 1000 titles 71.9049699306488 17.336802005767822
reST:
33071060 chars, 460 titles 33071060 chars, 460 titles 73.695631980896 28.3037269115448
try_parallel_markdown.py
#!/usr/bin/env python
import os
from concurrent import futures
import timeit
from markdown import Markdown
INDIR = '../../../gesetze'

# Absolute paths of every visible *.md file under the corpus checkout,
# collected in deterministic (sorted) order.
FILENAMES = sorted(
    os.path.join(os.path.abspath(dirpath), name)
    for dirpath, _dirnames, names in os.walk(INDIR)
    for name in names
    if not name.startswith('.') and name.endswith('.md')
)

# Benchmark only the first 1000 articles.
MAX = 1000  # len(FILENAMES) - 1
def render(f):
    """Read Markdown file *f*, drop its two '---' front-matter fences,
    convert it to HTML, and return a (html, metadata) tuple."""
    kept_lines = []
    fences_seen = 0
    with open(f, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Strip only the first two '---' fence lines; the field lines
            # between them stay in place for the 'meta' extension to parse.
            if fences_seen < 2 and line.startswith('---'):
                fences_seen += 1
                continue
            kept_lines.append(line)
    converter = Markdown(extensions={'codehilite', 'extra', 'meta'})
    html = converter.convert("".join(kept_lines))
    return (html, converter.Meta, )
def render_all():
    """Convert the first MAX files one after another on a single core,
    then print the aggregate character and title counts."""
    total_chars = 0
    titles = []
    for html, meta in map(render, FILENAMES[:MAX]):
        total_chars += len(html)
        # Articles without a 'title' metadata field get a placeholder.
        titles.append(meta.get('title', '~NO TITLE~'))
    print(total_chars, ' chars,', len(titles), ' titles')
def render_all_parallel():
    """Same aggregation as render_all(), but fan the conversions out
    to eight worker processes."""
    total_chars = 0
    titles = []
    with futures.ProcessPoolExecutor(max_workers=8) as pool:
        for html, meta in pool.map(render, FILENAMES[:MAX]):
            total_chars += len(html)
            # Articles without a 'title' metadata field get a placeholder.
            titles.append(meta.get('title', '~NO TITLE~'))
    print(total_chars, ' chars,', len(titles), ' titles')
if __name__ == '__main__':
    # Guard the benchmark driver: ProcessPoolExecutor children re-import
    # __main__ under the 'spawn' start method, and without this guard they
    # would re-run both benchmarks recursively.
    t = timeit.timeit('render_all()', number=1,
                      setup="from __main__ import render_all")
    tp = timeit.timeit('render_all_parallel()', number=1,
                       setup="from __main__ import render_all_parallel")
    print(t)
    print(tp)
try_parallel_reST.py:
#!/usr/bin/env python
import os
from concurrent import futures
import timeit
import re
from pelican.readers import RstReader
INDIR = "pylonsbook.com/en/1.1/_sources"
OUTDIR = INDIR
FILENAMES = []
_TMP_FILENAMES = []
for root, dirs, files in os.walk(INDIR):
for f in files:
if f.startswith('.') or not f.endswith('.txt'):
continue
_TMP_FILENAMES.append(os.path.join(os.path.abspath(root), f))
_TMP_FILENAMES.sort()
for a in _TMP_FILENAMES:
b = a[:-3] + 'rst'
FILENAMES.append(b)
with open(a, 'r', encoding='utf-8') as ha:
with open(b, 'w', encoding='utf-8') as hb:
for sa in ha:
sb = re.sub(r'(\.\.\s+\w+\s*::)', r' "\1"', sa) # mask '.. index::'
sb = re.sub(r':\w+:`[^`]+`', '', sb) # mask ':foo:`bar`'
hb.write(sb)
MAX = len(FILENAMES) - 1
N = 20
def render(f):
    """Parse one reST file with Pelican's reader and return the
    (content, metadata) pair it produces."""
    reader = RstReader({})
    return reader.read(f)
def render_all():
    """Render the whole corpus N times sequentially, then print the
    aggregate character and title counts."""
    total_chars = 0
    titles = []
    for _pass in range(N):
        for html, meta in map(render, FILENAMES[:MAX]):
            total_chars += len(html)
            # Files without a 'title' metadata field get a placeholder.
            titles.append(meta.get('title', '~NO TITLE~'))
    print(total_chars, ' chars,', len(titles), ' titles')
def render_all_parallel():
    """Render the corpus N times with eight worker processes, then print
    the aggregate character and title counts.

    The executor is created once and reused for all N passes.  The
    original created (and tore down) a fresh pool inside the loop, paying
    the start-up cost of eight worker processes twenty times and skewing
    the parallel timing; the aggregated output is identical either way.
    """
    chars = 0
    titles = []
    with futures.ProcessPoolExecutor(max_workers=8) as executor:
        for i in range(N):
            for content, metadata in executor.map(render, FILENAMES[:MAX]):
                chars += len(content)
                try:
                    titles.append(metadata['title'])
                except KeyError:
                    titles.append('~NO TITLE~')
    print(chars, ' chars,', len(titles), ' titles')
if __name__ == '__main__':
    # Guard the benchmark driver: ProcessPoolExecutor children re-import
    # __main__ under the 'spawn' start method, and without this guard they
    # would re-run both benchmarks recursively.
    t = timeit.timeit('render_all()', number=1,
                      setup="from __main__ import render_all")
    tp = timeit.timeit('render_all_parallel()', number=1,
                       setup="from __main__ import render_all_parallel")
    print(t)
    print(tp)
Tests were performed on this machine:
Linux Morrigan 3.2.0-2-amd64 #1 SMP Mon Jun 11 17:24:18 UTC 2012 x86_64 GNU/Linux Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 8 On-line CPU(s) list: 0-7 Thread(s) per core: 2 Core(s) per socket: 4 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 26 Stepping: 5 CPU MHz: 1600.000 BogoMIPS: 6414.16 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 8192K NUMA node0 CPU(s): 0-7