Faster strftime

The Data

from pandas import Series, date_range, Categorical, merge
# One million timestamps spaced five minutes apart, starting 2000-01-01.
# 'min' is the minute alias; the older 'T' spelling is deprecated in recent pandas.
s = Series(date_range('2000-01-01', periods=1_000_000, freq='5min'))

# Preview the first rows, then report the row count and the number of
# distinct calendar days in the series.
display(
    s.head(),
    f'{s.shape[0] = :,} | {s.dt.floor("D").nunique() = }'
)
0   2000-01-01 00:00:00
1   2000-01-01 00:05:00
2   2000-01-01 00:10:00
3   2000-01-01 00:15:00
4   2000-01-01 00:20:00
dtype: datetime64[ns]
's.shape[0] = 1,000,000 | s.dt.floor("D").nunique() = 3473'

The goal is to end up with date strings like these:

# Target output: each timestamp rendered with the '%Y%m%d' format.
s.head().dt.strftime('%Y%m%d')
0    20000101
1    20000101
2    20000101
3    20000101
4    20000101
dtype: object

The Timer

from dataclasses import dataclass, field
from contextlib import contextmanager
from time import perf_counter

@dataclass
class Timer:
    start: float = None
    end: float = None
    
    @property
    def elapsed(self):
        if self.start is None or self.end is None:
            raise ValueError('Timer must have both end and start')
        return self.end - self.start

@dataclass
class TimerManager:
    """Collect named timings produced by the :meth:`time` context manager."""
    # (description, Timer) pairs, in the order the timed regions completed.
    registry: list = field(default_factory=list)

    @contextmanager
    def time(self, description):
        """Time the body of a ``with`` block and record it under ``description``.

        Yields the in-progress ``Timer``. When the body finishes normally, the
        completed timer is appended to ``registry`` and a one-line summary is
        printed. If the body raises, the timer's ``end`` is still stamped but
        nothing is registered or printed, and the exception propagates.
        """
        timer = Timer(start=perf_counter())
        try:
            yield timer
        finally:
            # Stamp the end even on failure so the yielded timer is never
            # left with a start but no end.
            timer.end = perf_counter()
        self.registry.append((description, timer))
        # Pad to 29 columns plus an explicit space: identical to a 30-column
        # pad for short labels, but longer labels no longer run into the number.
        print(f'{description:<29} {timer.elapsed:.6f}s')
        
timer = TimerManager()
solutions = []

# ① Baseline: format every timestamp individually.
with timer.time('pandas|.strftime'):
    formatted = s.dt.strftime('%Y%m%d')
    solutions.append(formatted)

# ② Floor to the day, format each distinct day once, then map the strings
#    back onto every row and restore the original index.
with timer.time('pandas|floor→dedupe→realign'):
    tmp = s.dt.floor('D').rename('date')
    unique_days = tmp.set_axis(tmp).drop_duplicates()
    day_strings = unique_days.dt.strftime('%Y%m%d')
    realigned = day_strings.reindex(tmp).set_axis(s.index)
    solutions.append(realigned)

# ③ Factorize the floored days and format only the unique values, which
#    become the categories of a Categorical built from the codes.
with timer.time('pandas|floor→factorize→strftime'):
    codes, cats = s.dt.floor('D').factorize()
    labels = cats.strftime('%Y%m%d')
    solutions.append(Categorical.from_codes(codes, categories=labels))
pandas|.strftime              3.503681s
pandas|floor→dedupe→realign   0.059551s
pandas|floor→factorize→strftime0.038854s
import numba

@numba.vectorize
def nanos_to_yyyymmdd_numba(nanos: int) -> int:
    """
    Convert a nanosecond timestamp into an integer of the form YYYYMMDD.

    The input is divided into whole days (86_400_000_000_000 ns each) and the
    day count is turned into year/month/day parts with integer arithmetic.

    :WARNING: This is copy-pasted and pythonified from this C algorithm:
    <https://howardhinnant.github.io/date_algorithms.html#civil_from_days>.
    It has been only lightly checked over a range of ~20y for correct results.
    """
    z   = (nanos // 86400000000000) + 719468
    # `//` already floors toward negative infinity, so the C original's
    # `z - 146096` adjustment for negative z (needed with truncating division)
    # is unnecessary here and would over-correct.
    era = z // 146097
    doe = z - era * 146097 # [0, 146096]
    yoe = (doe - doe // 1460 + doe // 36524 - doe // 146096) // 365 # [0, 399]
    y   = yoe + era * 400
    doy = doe - (365 * yoe + yoe // 4 - yoe // 100) # [0, 365]
    mp  = (5 * doy + 2) // 153 # [0, 11]
    d   = doy - (153 * mp + 2) // 5 + 1 # [1, 31]
    m   = mp + 3 if mp < 10 else mp - 9 # [1, 12]
    # The computed year is bumped by one for months 1 and 2.
    y   = y + 1 if m <= 2 else y
    return y * 10_000 + m * 100 + d
    
with timer.time('numba|parts math'):      # ④
    # Drop any timezone, then expose the timestamps as a NumPy int64 array
    # (the series is datetime64[ns], so these are nanosecond counts).
    array_nanos = s.dt.tz_localize(None).astype('int64').to_numpy()
    # NOTE(review): the decorator on nanos_to_yyyymmdd_numba was given no
    # signatures and this is its first call, so this timing presumably
    # includes numba's compilation — confirm whether that is intended.
    solutions.append(
        nanos_to_yyyymmdd_numba(array_nanos)
    )
    
with timer.time('pandas|parts math'):     # ⑤
    # Build the YYYYMMDD integer arithmetically from the year, month and
    # day accessors instead of formatting strings.
    solutions.append(
        (s.dt.year*10_000 + s.dt.month*100 + s.dt.day)
    )
numba|parts math              0.485541s
pandas|parts math             0.096831s
from polars import from_pandas, col, Categorical as pl_Categorical

# Lazy polars frame holding the timestamps in a single 'ts' column.
pl_df = from_pandas(s.to_frame('ts')).lazy()

with timer.time('polars|strftime'):         # ⑦
    # Baseline: format every timestamp; .collect() is called inside the
    # timed region.
    solutions.append(
        pl_df.select(col('ts').dt.strftime('%Y%m%d'))
        .collect()
    )
# Conversion back to a pandas Series happens outside the timed region.
solutions[-1] = solutions[-1].to_pandas()['ts']
    
with timer.time('polars|date→dedupe→join'): # ⑧
    # Attach a 'date' column, format one row per distinct date, then join
    # the formatted strings back onto every row by 'date'.
    # NOTE(review): no ordering argument is passed to `unique` or `join`;
    # confirm the joined rows come back in the original `ts` order, since the
    # result is later compared element-wise with the other solutions.
    linkage = pl_df.with_columns(date=col('ts').dt.date())
    solutions.append(
        linkage
        .unique('date')
        .with_columns(date_str=col('date').dt.strftime('%Y%m%d'))
        .join(linkage, on='date', how='inner')
        .collect()
    )
# Conversion back to a pandas Series happens outside the timed region.
solutions[-1] = solutions[-1].to_pandas()['date_str']
polars|strftime               0.188251s
polars|date→dedupe→join       0.033758s
from itertools import pairwise
# Sanity check: each solution must equal its neighbour once both are cast to int.
for left, right in pairwise(solutions):
    agree = left.astype(int) == right.astype(int)
    assert agree.all()
from pandas import DataFrame

# Tabulate the recorded timings: one row per timed region, slowest first.
df = (
    DataFrame(timer.registry, columns=['description', 'timer'])
    .assign(
        # Timer.elapsed scaled by 1_000 for the millisecond column.
        elapsed_ms=lambda d: 1_000 * d['timer'].map(lambda t: t.elapsed),
        # Package name is everything before the last '|'. The raw string
        # keeps `\|` from being an invalid string escape, and expand=False
        # yields a Series rather than a one-column DataFrame.
        package=lambda d: d['description'].str.extract(r'(.*)\|', expand=False),
    )
    .drop(columns='timer')
    .sort_values('elapsed_ms', ascending=False)
)

df
description elapsed_ms package
0 pandas|.strftime 3503.681484 pandas
3 numba|parts math 485.540571 numba
5 polars|strftime 188.250758 polars
4 pandas|parts math 96.830522 pandas
1 pandas|floor→dedupe→realign 59.551314 pandas
2 pandas|floor→factorize→strftime 38.853838 pandas
6 polars|date→dedupe→join 33.757686 polars
%matplotlib inline
from matplotlib.pyplot import rc, setp
from flexitext import flexitext
import polars, pandas

# Figure-wide defaults: size, background, font size, and no top/right/left spines.
rc('figure', figsize=(10, 6), facecolor='white')
rc('font', size=12)
rc('axes.spines', top=False, right=False, left=False)


# One colour per package; reused for bars, tick labels and the title text.
palette = {
    'polars': '#1F77B4FF',
    'pandas': '#FF7F0EFF',
    'numba':  '#2CA02C',
}

# Horizontal bar per timed approach, coloured by the package it belongs to.
ax = df.plot.barh(
    x='description', y='elapsed_ms', legend=False, width=.8,
    color=df['package'].map(palette),
)
ax.set_ylabel('')
ax.yaxis.set_tick_params(length=0)
# Print each bar's value next to it, formatted as whole milliseconds.
ax.bar_label(ax.containers[0], fmt='{:.0f}ms', padding=5)
ax.set_xlabel(r'Duration (ms)')
ax.margins(y=0)

# Rewrite each tick label as the package name followed by the approach
# right-aligned in 25 columns, drawn in monospace and in the package's colour.
new_labels = []
for lab in ax.get_yticklabels():
    package, _, approach = lab.get_text().partition('|')
    lab.set(
        text=f'{package}{approach:>25}', 
        color=palette[package],
        size='large',
        fontfamily='monospace',
    )
    new_labels.append(lab)
ax.set_yticklabels(new_labels)

# Lay the figure out before measuring, then take the smallest x0 of the tick
# labels' bounding boxes and pass it through the inverted ax.transAxes
# transform to get the x position used for the title below.
ax.figure.tight_layout()
left_x = min(text.get_tightbbox().x0 for text in ax.get_yticklabels())
x,_ = ax.transAxes.inverted().transform([left_x, 0])

# Two-line annotation: a headline, then the library versions, each coloured
# with its palette entry; placed at the computed x and y=1.02.
annot = flexitext(
    s='<size:x-large,weight:semibold>'
        'Time Elapsed for 1 million row datetime→date string conversion\n'
      '</>'
      '<size:large>'
         f'    <color:{palette["pandas"]}>pandas {pandas.__version__}</>'
         f' vs <color:{palette["numba"]}>numba {numba.__version__}</>'
         f' vs <color:{palette["polars"]}>Polars {polars.__version__}</>'
      '</>',
    x=x, y=1.02, va='bottom', ha='left',
);
../_images/51e0498737c61da4f86c32b4ef12101fea651640a1e912626ef5636d440b4835.png