Python file search benchmark

Out of curiosity, I did a quick benchmark of a few variations of Python code that searches 10,000 file names for the lexicographically last one matching a specific pattern. I measured the average time of 100 trials for each variation.

Here’s the result:

without max:
        0.0013002311 seconds
with max:
        0.0001507549 seconds
with max and license:
        0.0014262455 seconds

In this case, Python is fast enough that it doesn’t matter which approach is used. It’s best to avoid premature optimization.

Below is the code that gave the results above. Part of why code like this is sometimes necessary is that functions like os.listdir return entries in an arbitrary order.

import random
import re
import time


pattern: re.Pattern = re.compile(r"^(\d\d\d)_\w+\.sql$")

file_names: list[str] = []
for i in range(10000):
    rand_s: str = ""
    for _ in range(20):
        rand_s += chr(random.randint(ord("a"), ord("z")))
    file_names.append(str(i).zfill(3) + "_" + rand_s + ".sql")


def main():
    n: int = 100

    total1: float = 0
    for _ in range(n):
        start1: float = time.perf_counter()
        _ = without_max()
        end1: float = time.perf_counter()
        total1 += end1 - start1

    print(f"without max:\n\t{total1 / n:.10f} seconds")

    total2: float = 0
    for _ in range(n):
        start2: float = time.perf_counter()
        _ = with_max()
        end2: float = time.perf_counter()
        total2 += end2 - start2

    print(f"with max:\n\t{total2 / n:.10f} seconds")

    file_names.append("LICENSE")
    total3: float = 0
    for _ in range(n):
        start3: float = time.perf_counter()
        _ = with_max()
        end3: float = time.perf_counter()
        total3 += end3 - start3

    print(f"with max and license:\n\t{total3 / n:.10f} seconds")


def without_max() -> int | None:
    latest: int = -1
    for entry in file_names:
        match: re.Match | None = pattern.match(entry)
        if match:
            v: int = int(match.group(1))
            if v > latest:
                latest = v

    if latest == -1:
        return None
    return latest


def with_max() -> int | None:
    entry: str = max(file_names)
    match: re.Match | None = pattern.match(entry)
    if match:
        return int(match.group(1))
    print(end="")

    latest: int = -1
    for entry in file_names:
        match: re.Match | None = pattern.match(entry)
        if match:
            v: int = int(match.group(1))
            if v > latest:
                latest = v

    if latest == -1:
        return None
    return latest


main()