String Extraction Benchmarkz

The purpose of this tests was to find out the fastest way to extract parts of a string, in this case the keyvalue pairs separated by ’;’ inside of one string.

All Tests are performed with the data:

  • “param:dt=123”
  • “param:dt=123; bla=blub; stichdatum=01.01.2019
  • “param:dt=123; bla=blub; stichdatum=01.01.2019; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub”

Benchmark 1: Spliting the string

The first test was performedby spliting the string and manually parsing those parts.
This is the approach seen quite often however is probably the worst one of those as I expect those two .Split() to hurt us badly

Code

    public static IEnumerable<ParamField> ExtractParamFieldSplit(string? inputString)
    {
        if (inputString is not { Length: > 6 })
            return Enumerable.Empty<ParamField>();
 
        var paramindex = inputString.IndexOf("param:",
                                             StringComparison.InvariantCultureIgnoreCase);
        if (paramindex == -1)
            return Enumerable.Empty<ParamField>();
 
        var splits = inputString[(paramindex + 6)..].Split(';');
        return splits.Select(x =>
        {
            var value = x.Split('=');
            return new ParamField
            {
                Key = value[0].Trim(),
                Value = value[1].Trim()
            };
        });
    }

Benchmark 2: Using Spans and Slices

While it’s not that fast using net48 there should be a smiliar performance for this small set of data with a way better performance the huger the set grows while using spans and ranges.
While there might not be a real performance benefit we’d expect at least some savings due to the reduced allocation.

Code

    public static IEnumerable<ParamField> ExtractParamFieldSpan(string? inputString)
    {
        if (inputString is not { Length: > 6 })
            return Enumerable.Empty<ParamField>();
 
        var identifierIndex = inputString.IndexOf("param:",
                                                  StringComparison.InvariantCultureIgnoreCase);
        if (identifierIndex == -1)
            return Enumerable.Empty<ParamField>();
 
        var inputAsSpan = inputString.AsSpan();
        var startIndex = identifierIndex + 6;
 
        var endOfFirstPair = inputAsSpan.IndexOf(';');
        var pair = endOfFirstPair > 1
            ? inputAsSpan.Slice(startIndex,
                                endOfFirstPair - startIndex)
            : inputAsSpan[startIndex..];
 
        var separatorIndex = pair.IndexOf('=');
        var results = new List<ParamField>();
        while (separatorIndex != -1)
        {
            results.Add(new ParamField
            {
                Key = pair[..separatorIndex].Trim().ToString(),
                Value = pair[(separatorIndex + 1)..].Trim().ToString()
            });
 
            pair = pair[(separatorIndex + 1)..];
            separatorIndex = pair.IndexOf('=');
        }
 
        return results;
    }

Benchmark 3: Using a Regex

While I’d argue that the code itself should be quite readable deciphering a regex sucks really bad. It’s however undeniable that using a regex in those cases most often performs best when used correctly and is less error prone.
Apart from that the allocation should be at about the same level as using spans

Code

This regex here however is pretty tiny and just consist of 2 capturing groups, the = separator, and a non-capturing group

  • (\w+)Any Alphanumerics to serve as key
  • = Separates Key and Value
  • (.*?) Well anything to be used as value
  • until (?:;|$) which is not captured and observes if a ; separates a new key value pair or the end of the input is reached
    private static readonly Regex regexRulez = new(@"(\w+)=(.*?)(?:;|$)", RegexOptions.Compiled);
    public static IEnumerable<ParamField> ExtractParamFieldRegex(string? inputString)
    {
        if (inputString is not { Length: > 6 })
            return Enumerable.Empty<ParamField>();
 
        var matches = regexRulez.Matches(inputString);
 
        return matches.Cast<Match>()
                      .Select(match => new ParamField
                      {
                          Key = match.Groups[1].Value.Trim(),
                          Value = match.Groups[2].Value.Trim()
                      });
    }
 

Results

BenchmarkDotNet v0.13.12, Windows 10 (10.0.19045.3930/22H2/2022Update)
12th Gen Intel Core i9-12900H, 1 CPU, 20 logical and 14 physical cores
.NET SDK 8.0.101
[Host]             : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
.NET 6.0           : .NET 6.0.26 (6.0.2623.60508), X64 RyuJIT AVX2
.NET 8.0           : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
.NET Framework 4.8 : .NET Framework 4.8 (4.8.4645.0), X64 RyuJIT VectorSize=256
MethodJobRuntimeMeanErrorStdDevGen0Gen1Allocated
SplitExtractSingle.NET 6.0.NET 6.048.98 ns0.568 ns0.532 ns0.0095-120 B
SplitExtractThree.NET 6.0.NET 6.085.61 ns1.679 ns3.070 ns0.0280-352 B
SplitExtractTwenty.NET 6.0.NET 6.0246.69 ns1.309 ns1.161 ns0.11970.00051504 B
SpanExtractSingle.NET 6.0.NET 6.064.67 ns1.321 ns1.852 ns0.0147-184 B
SpanExtractThree.NET 6.0.NET 6.062.77 ns1.208 ns1.342 ns0.0147-184 B
SpanExtractTwenty.NET 6.0.NET 6.062.30 ns1.237 ns1.270 ns0.0147-184 B
RegexExtractSingle.NET 6.0.NET 6.026.68 ns0.518 ns0.960 ns0.0115-144 B
RegexExtractThree.NET 6.0.NET 6.026.30 ns0.534 ns0.766 ns0.0115-144 B
RegexExtractTwenty.NET 6.0.NET 6.026.78 ns0.557 ns1.100 ns0.0115-144 B
SplitExtractSingle.NET 8.0.NET 8.037.63 ns0.759 ns0.844 ns0.0095-120 B
SplitExtractThree.NET 8.0.NET 8.063.43 ns0.232 ns0.194 ns0.0280-352 B
SplitExtractTwenty.NET 8.0.NET 8.0185.71 ns0.863 ns0.721 ns0.1197-1504 B
SpanExtractSingle.NET 8.0.NET 8.042.31 ns0.201 ns0.168 ns0.0147-184 B
SpanExtractThree.NET 8.0.NET 8.041.99 ns0.437 ns0.388 ns0.0147-184 B
SpanExtractTwenty.NET 8.0.NET 8.042.70 ns0.191 ns0.160 ns0.0147-184 B
RegexExtractSingle.NET 8.0.NET 8.019.96 ns0.388 ns0.363 ns0.0115-144 B
RegexExtractThree.NET 8.0.NET 8.018.94 ns0.169 ns0.150 ns0.0115-144 B
RegexExtractTwenty.NET 8.0.NET 8.018.86 ns0.138 ns0.115 ns0.0115-144 B
SplitExtractSingle.NET Framework 4.8.NET Framework 4.897.04 ns0.465 ns0.388 ns0.0343-217 B
SplitExtractThree.NET Framework 4.8.NET Framework 4.8146.96 ns0.422 ns0.330 ns0.0942-594 B
SplitExtractTwenty.NET Framework 4.8.NET Framework 4.8420.07 ns1.209 ns1.010 ns0.40910.00382576 B
SpanExtractSingle.NET Framework 4.8.NET Framework 4.8124.91 ns1.910 ns1.693 ns0.0305-193 B
SpanExtractThree.NET Framework 4.8.NET Framework 4.8125.28 ns2.491 ns2.447 ns0.0305-193 B
SpanExtractTwenty.NET Framework 4.8.NET Framework 4.8123.78 ns0.516 ns0.431 ns0.0305-193 B
RegexExtractSingle.NET Framework 4.8.NET Framework 4.867.39 ns0.365 ns0.305 ns0.0356-225 B
RegexExtractThree.NET Framework 4.8.NET Framework 4.867.28 ns0.347 ns0.307 ns0.0356-225 B
RegexExtractTwenty.NET Framework 4.8.NET Framework 4.867.41 ns0.669 ns0.626 ns0.0356-225 B

Analyzing stuff

This result matches my expectations to a great extent. Splitting sucks exponentially while becoming slower and slower.

The allocation represents only the allocation during the method calls and not the poco creation. I still wonder why regex performs worse than rangin on old net48 tho…

Testcode

Full SourceCode ```csharp using System.Text.RegularExpressions; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs;

namespace Benchmarkz;

[SimpleJob(RuntimeMoniker.Net48)] [SimpleJob(RuntimeMoniker.Net60)] [SimpleJob(RuntimeMoniker.Net80)] [MemoryDiagnoser] public class RangeBenchmarks { private static readonly Regex regexRulez = new(@”(\w+)=(.*?)(?:;|$)”, RegexOptions.Compiled);

private const string? JustOne = "param:dt=123";
private const string? Three = "param:dt=123; bla=blub; stichdatum=01.01.2019";
private const string? Twenty = "param:dt=123; bla=blub; stichdatum=01.01.2019; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub; bla=blub";

[Benchmark]
public void SplitExtractSingle() => ExtractParamFieldSplit(JustOne);

[Benchmark]
public void SplitExtractThree() => ExtractParamFieldSplit(Three);
[Benchmark]
public void SplitExtractTwenty() => ExtractParamFieldSplit(Twenty);

[Benchmark]
public void SpanExtractSingle() => ExtractParamFieldSpan(JustOne);

[Benchmark]
public void SpanExtractThree() => ExtractParamFieldSpan(Three);
[Benchmark]
public void SpanExtractTwenty() => ExtractParamFieldSpan(Twenty);

[Benchmark]
public void RegexExtractSingle() => ExtractParamFieldRegex(JustOne);

[Benchmark]
public void RegexExtractThree() => ExtractParamFieldRegex(Three);

[Benchmark]
public void RegexExtractTwenty() => ExtractParamFieldRegex(Twenty);

public static IEnumerable<ParamField> ExtractParamFieldSplit(string? inputString)
{
    if (inputString is not { Length: > 6 })
        return Enumerable.Empty<ParamField>();

    var paramindex = inputString.IndexOf("param:",
                                         StringComparison.InvariantCultureIgnoreCase);
    if (paramindex == -1)
        return Enumerable.Empty<ParamField>();

    var splits = inputString[(paramindex + 6)..].Split(';');
    return splits.Select(x =>
    {
        var value = x.Split('=');
        return new ParamField
        {
            Key = value[0].Trim(),
            Value = value[1].Trim()
        };
    });
}

public static IEnumerable<ParamField> ExtractParamFieldSpan(string? inputString)
{
    if (inputString is not { Length: > 6 })
        return Enumerable.Empty<ParamField>();

    var identifierIndex = inputString.IndexOf("param:",
                                              StringComparison.InvariantCultureIgnoreCase);
    if (identifierIndex == -1)
        return Enumerable.Empty<ParamField>();

    var inputAsSpan = inputString.AsSpan();
    var startIndex = identifierIndex + 6;

    var endOfFirstPair = inputAsSpan.IndexOf(';');
    var pair = endOfFirstPair > 1
        ? inputAsSpan.Slice(startIndex,
                            endOfFirstPair - startIndex)
        : inputAsSpan[startIndex..];

    var separatorIndex = pair.IndexOf('=');
    var results = new List<ParamField>();
    while (separatorIndex != -1)
    {
        results.Add(new ParamField
        {
            Key = pair[..separatorIndex].Trim().ToString(),
            Value = pair[(separatorIndex + 1)..].Trim().ToString()
        });

        pair = pair[(separatorIndex + 1)..];
        separatorIndex = pair.IndexOf('=');
    }

    return results;
}

public static IEnumerable<ParamField> ExtractParamFieldRegex(string? inputString)
{
    if (inputString is not { Length: > 6 })
        return Enumerable.Empty<ParamField>();

    var matches = regexRulez.Matches(inputString);

    return matches.Cast<Match>()
                  .Select(match => new ParamField
                  {
                      Key = match.Groups[1].Value.Trim(),
                      Value = match.Groups[2].Value.Trim()
                  });
}

public class ParamField
{
    public string Key { get; set; }
    public string Value { get; set; }
}

}


</details>