# run_benchmark_v4.ps1 # OneCharacterCode V4 benchmark - PowerShell 5.1 compatible. # # THREE SEPARATE TESTS, REPORTED SEPARATELY: # # Mode A Standalone file compression. # For each file set: encode each file with its own per-file # dictionary (V3-style) and sum. Compare against gzip. # # Mode B Persistent shared dictionary. # Build ONE dictionary from a training subset. The # dictionary is "installed" once. Subsequent files are # encoded as body-only symbolic packets that reference # the installed dictionary. The dictionary is NOT # re-transmitted. Cumulative wire bytes are compared # against the cumulative cost of downloading each file # in raw form. # # Mode C Delta updates. # For consecutive file pairs in a versioned series, the # sender transmits only the changed middle bytes # (between longest common prefix and longest common # suffix), inside an OCC carrier. The receiver applies # the delta to its previously-installed copy and the # result must SHA-256-match. # # All three modes verify reconstruction by SHA-256 round-trip. No # claim of an OCC win is made unless the math AND the round-trip # both pass. 
$ErrorActionPreference = 'Stop'

# All folders are resolved relative to the location of this script.
$scriptDir  = Split-Path -Parent $MyInvocation.MyCommand.Path
$inputsRoot = Join-Path $scriptDir 'inputs'
$outputsDir = Join-Path $scriptDir 'outputs'
if (-not (Test-Path $outputsDir)) { New-Item -ItemType Directory -Path $outputsDir | Out-Null }

# ----------------------------------------------------------------------
# Token table (same shape as V3)
# ----------------------------------------------------------------------
# Sixteen single-byte tier-1 tokens. 0x09, 0x0A and 0x0D are not in the
# list (presumably so tab/LF/CR always stay literal - confirm against V3).
$TIER1_BYTES = @(0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,
                 0x0B,0x0C,
                 0x0E,0x0F,0x10,0x11,0x12,0x13)
$TIER2_ESC_A = 0x14   # tier-2 escape, entries 0..255
$TIER2_ESC_B = 0x15   # tier-2 escape, entries 256..511
$TIER3_ESC   = 0x16   # tier-3 escape, followed by a 16-bit little-endian index
$LITERAL_ESC = 0x17   # prefix for a literal byte that collides with a reserved value

# Every byte value that has a token/escape meaning and therefore must be
# escaped with LITERAL_ESC when it occurs as plain data.
$RESERVED = New-Object 'System.Collections.Generic.HashSet[byte]'
foreach ($b in $TIER1_BYTES) { [void]$RESERVED.Add([byte]$b) }
[void]$RESERVED.Add([byte]$TIER2_ESC_A)
[void]$RESERVED.Add([byte]$TIER2_ESC_B)
[void]$RESERVED.Add([byte]$TIER3_ESC)
[void]$RESERVED.Add([byte]$LITERAL_ESC)

# First code point used for in-memory placeholders. Input bytes are mapped
# to chars 0..255 (Latin-1), so values from here up can never be real data.
$PLACEHOLDER_FIRST = 0xE000

# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------

# Returns the lowercase hex SHA-256 of a byte array.
# A PowerShell function that returns an empty byte[] hands $null to its
# caller, so $null is hashed as the empty input instead of throwing.
function Get-Sha256Hex([byte[]]$bytes) {
    if ($null -eq $bytes) { $bytes = [byte[]]@() }
    $sha = [System.Security.Cryptography.SHA256]::Create()
    try {
        ($sha.ComputeHash($bytes) | ForEach-Object { $_.ToString('x2') }) -join ''
    }
    finally {
        $sha.Dispose()
    }
}

# Returns the gzip-compressed form of $bytes.
# Both streams are released even when the write fails.
function Gzip-Bytes([byte[]]$bytes) {
    $ms = New-Object System.IO.MemoryStream
    try {
        $gz = New-Object System.IO.Compression.GZipStream($ms, [System.IO.Compression.CompressionMode]::Compress)
        try {
            $gz.Write($bytes, 0, $bytes.Length)
        }
        finally {
            # Disposing the gzip stream flushes the trailer into $ms.
            $gz.Dispose()
        }
        # MemoryStream.ToArray is still valid after the stream is closed.
        return $ms.ToArray()
    }
    finally {
        $ms.Dispose()
    }
}

# Maps each byte to the char with the same numeric value (code page 28591).
function Bytes-To-Latin1String([byte[]]$bytes) {
    [System.Text.Encoding]::GetEncoding(28591).GetString($bytes)
}

# Inverse of Bytes-To-Latin1String.
function Latin1String-To-Bytes([string]$s) {
    [System.Text.Encoding]::GetEncoding(28591).GetBytes($s)
}

# Counts non-overlapping, left-to-right ordinal occurrences of $needle.
# An empty needle counts as 0.
function Count-Occurrences([string]$haystack, [string]$needle) {
    if ([string]::IsNullOrEmpty($needle)) { return 0 }
    $count = 0; $idx = 0; $nlen = $needle.Length
    while ($true) {
        $found = $haystack.IndexOf($needle, $idx, [System.StringComparison]::Ordinal)
        if ($found -lt 0) { break }
        $count++
        $idx = $found + $nlen
    }
    return $count
}

# Replaces every non-overlapping, left-to-right ordinal occurrence of
# $needle with $replacement. An empty needle returns the input unchanged.
function Replace-All-NonOverlap([string]$haystack, [string]$needle, [string]$replacement) {
    if ([string]::IsNullOrEmpty($needle)) { return $haystack }
    $sb = New-Object System.Text.StringBuilder
    $idx = 0; $nlen = $needle.Length
    while ($true) {
        $found = $haystack.IndexOf($needle, $idx, [System.StringComparison]::Ordinal)
        if ($found -lt 0) { [void]$sb.Append($haystack.Substring($idx)); break }
        if ($found -gt $idx) { [void]$sb.Append($haystack.Substring($idx, $found - $idx)) }
        [void]$sb.Append($replacement)
        $idx = $found + $nlen
    }
    return $sb.ToString()
}

# ----------------------------------------------------------------------
# Build a dictionary (list of accepted phrases) from arbitrary input bytes.
# Single-pass scan version for speed on large inputs.
#
# Greedy: each round scans every requested phrase length, picks the phrase
# with the best estimated net saving, replaces it with a placeholder char
# and repeats until nothing profitable is left or $maxEntries is reached.
#
# Parameters:
#   inputBytes     bytes to mine for repeated phrases
#   phraseLengths  candidate phrase lengths (values below 3 are ignored)
#   maxEntries     upper bound on accepted phrases
# Returns: entries @{ Text; Length; Count } sorted by Length * Count,
#          descending (nothing when no phrase is profitable).
# ----------------------------------------------------------------------
function Build-Dictionary {
    param(
        [byte[]]$inputBytes,
        [int[]]$phraseLengths,
        [int]$maxEntries
    )
    $working  = Bytes-To-Latin1String $inputBytes
    $accepted = New-Object 'System.Collections.Generic.List[string]'
    $tentativeTokenCost = 2   # assumed bytes per token while estimating savings
    $placeholderIdx = 0

    # Each round either accepts exactly one phrase or stops.
    while ($accepted.Count -lt $maxEntries) {
        $bestText = $null; $bestNet = 0
        foreach ($L in $phraseLengths) {
            if ($L -lt 3) { continue }
            if ($working.Length -lt ($L * 2)) { continue }

            # $counts holds NON-overlapping occurrence counts, which is what
            # Replace-All-NonOverlap will actually replace. Counting every
            # sliding window instead over-estimates savings for repetitive
            # runs. $nextFree is the first index at which the phrase may be
            # counted again.
            $counts   = New-Object 'System.Collections.Generic.Dictionary[string,int]'
            $nextFree = New-Object 'System.Collections.Generic.Dictionary[string,int]'
            $upper = $working.Length - $L

            # $lastPh = index of the most recent placeholder at or before the
            # window's last char; the window [i, i+L-1] is clean iff $lastPh < i.
            $lastPh = -1
            for ($p = 0; $p -lt ($L - 1); $p++) {
                if ([int]$working[$p] -ge $PLACEHOLDER_FIRST) { $lastPh = $p }
            }
            for ($i = 0; $i -le $upper; $i++) {
                $tail = $i + $L - 1
                if ([int]$working[$tail] -ge $PLACEHOLDER_FIRST) { $lastPh = $tail }
                if ($lastPh -ge $i) { continue }   # window overlaps an accepted phrase

                $sub = $working.Substring($i, $L)
                if ($counts.ContainsKey($sub)) {
                    if ($i -ge $nextFree[$sub]) {
                        $counts[$sub]   = $counts[$sub] + 1
                        $nextFree[$sub] = $i + $L
                    }
                } else {
                    $counts[$sub]   = 1
                    $nextFree[$sub] = $i + $L
                }
            }

            foreach ($kv in $counts.GetEnumerator()) {
                if ($kv.Value -lt 2) { continue }
                # saving per hit * hits, minus the dictionary cost (length byte + text)
                $net = (($L - $tentativeTokenCost) * $kv.Value) - (1 + $L)
                if ($net -gt $bestNet) { $bestNet = $net; $bestText = $kv.Key }
            }
        }
        if ($null -eq $bestText -or $bestNet -le 0) { break }
        if ($placeholderIdx -gt 0x1FFF) { break }   # keeps placeholders within 0xE000..0xFFFF

        $phChar = [char]($PLACEHOLDER_FIRST + $placeholderIdx); $placeholderIdx++
        $accepted.Add($bestText)
        $working = Replace-All-NonOverlap $working $bestText ([string]$phChar)
    }

    # Recount occurrences in the original (in acceptance order, replay style)
    $replay    = Bytes-To-Latin1String $inputBytes
    $entryInfo = New-Object 'System.Collections.Generic.List[object]'
    $phIdx = 0
    foreach ($text in $accepted) {
        $count = Count-Occurrences $replay $text
        $ph = [char]($PLACEHOLDER_FIRST + $phIdx); $phIdx++
        $replay = Replace-All-NonOverlap $replay $text ([string]$ph)
        $entryInfo.Add(@{ Text = $text; Length = $text.Length; Count = $count })
    }

    # Sort by raw bytes saved (Length * Count) descending - this is the
    # tier assignment order.
    $ranked = @($entryInfo | Sort-Object -Descending { $_.Length * $_.Count })
    return $ranked
}

# ----------------------------------------------------------------------
# Assign tier token bytes given a dictionary list (already ordered best-first).
# Returns: list of @{ Text; Length; Count; Tier; TokenBytes }
# ----------------------------------------------------------------------
# Parameters:
#   rankedEntries  entries @{ Text; Length; Count }, best first
#   maxTier1       number of 1-byte tokens to hand out
#   maxTier2       number of 2-byte tokens to hand out
#   maxTier3       number of 3-byte tokens to hand out
# Each limit is clamped to what the token format can address: the size of
# $TIER1_BYTES for tier 1, 512 for tier 2 (two escape bytes x 256) and
# 65536 for tier 3 (16-bit index). Without the clamp an oversized tier-1
# limit yields token byte 0x00 and an oversized tier-2 limit fails on a
# byte cast. Entries beyond all tiers get Tier = 0 and TokenBytes = $null.
function Assign-Tiers {
    param(
        [object[]]$rankedEntries,
        [int]$maxTier1 = 16,
        [int]$maxTier2 = 512,
        [int]$maxTier3 = 256
    )
    $cap1 = [Math]::Max(0, [Math]::Min($maxTier1, $TIER1_BYTES.Count))
    $cap2 = [Math]::Max(0, [Math]::Min($maxTier2, 512))
    $cap3 = [Math]::Max(0, [Math]::Min($maxTier3, 65536))
    $tier2End = $cap1 + $cap2
    $tier3End = $tier2End + $cap3

    $out  = New-Object 'System.Collections.Generic.List[object]'
    $rank = 0
    foreach ($e in $rankedEntries) {
        $tier = 0; $tb = $null
        if ($rank -lt $cap1) {
            $tier = 1
            $tb = @([byte]$TIER1_BYTES[$rank])
        }
        elseif ($rank -lt $tier2End) {
            $tier = 2
            $slot = $rank - $cap1
            if ($slot -lt 256) { $tb = @([byte]$TIER2_ESC_A, [byte]$slot) }
            else               { $tb = @([byte]$TIER2_ESC_B, [byte]($slot - 256)) }
        }
        elseif ($rank -lt $tier3End) {
            $tier = 3
            $slot = $rank - $tier2End
            # index is stored little-endian: low byte first
            $lo = [byte]($slot -band 0xFF)
            $hi = [byte](($slot -shr 8) -band 0xFF)
            $tb = @([byte]$TIER3_ESC, $lo, $hi)
        }
        [void]$out.Add(@{ Text = $e.Text; Length = $e.Length; Count = $e.Count; Tier = $tier; TokenBytes = $tb })
        $rank++
    }
    return $out
}

# ----------------------------------------------------------------------
# Encode body bytes using a pre-built tier dictionary.
# Returns: byte[]
# ----------------------------------------------------------------------
# Works in two passes: first every dictionary phrase that owns a token is
# swapped for a private placeholder char (in dictionary order), then the
# text is walked once, emitting token bytes for placeholders and literal
# bytes for everything else. A literal whose value is reserved is prefixed
# with $LITERAL_ESC.
function Encode-Body-WithTiers {
    param([byte[]]$inputBytes, [object[]]$tierDict)

    $text = Bytes-To-Latin1String $inputBytes

    # Pass 1: phrase -> placeholder, remembering which token each one stands for.
    $tokensByMarker = @{}
    $nextMarker = 0
    foreach ($entry in $tierDict) {
        if ($null -ne $entry.TokenBytes) {
            $marker = [char]($PLACEHOLDER_FIRST + $nextMarker)
            $nextMarker++
            $text = Replace-All-NonOverlap $text $entry.Text ([string]$marker)
            $tokensByMarker[$marker] = $entry.TokenBytes
        }
    }

    # Pass 2: serialize.
    $encoded = New-Object System.Collections.Generic.List[byte]
    foreach ($symbol in $text.ToCharArray()) {
        $value = [int]$symbol
        if ($value -ge $PLACEHOLDER_FIRST) {
            foreach ($tokenByte in $tokensByMarker[$symbol]) { [void]$encoded.Add([byte]$tokenByte) }
            continue
        }
        $literal = [byte]$value
        if ($RESERVED.Contains($literal)) { [void]$encoded.Add([byte]$LITERAL_ESC) }
        [void]$encoded.Add($literal)
    }
    return $encoded.ToArray()
}

# ----------------------------------------------------------------------
# Build a full self-contained V4 carrier (dictionary + body).
# Magic "OCC4".
# Returns: byte[]
# ----------------------------------------------------------------------
# Layout: "OCC4", tier-1 count (1 byte), tier-1 phrases, tier-2 count
# (2 bytes LE), tier-2 phrases, tier-3 count (2 bytes LE), tier-3 phrases,
# body length (4 bytes LE), body. Each phrase is a length byte followed by
# its Latin-1 bytes.
function Build-Carrier-FullDict {
    param([byte[]]$inputBytes, [object[]]$tierDict)

    $payload = Encode-Body-WithTiers -inputBytes $inputBytes -tierDict $tierDict

    $carrier = New-Object System.Collections.Generic.List[byte]
    foreach ($magicByte in [System.Text.Encoding]::ASCII.GetBytes('OCC4')) { $carrier.Add($magicByte) }

    foreach ($tierNumber in 1, 2, 3) {
        $members = @($tierDict | Where-Object { $_.Tier -eq $tierNumber })
        $memberCount = $members.Count
        if ($tierNumber -eq 1) {
            # tier 1 has at most a handful of entries: single count byte
            $carrier.Add([byte]$memberCount)
        }
        else {
            $carrier.Add([byte]($memberCount -band 0xFF))
            $carrier.Add([byte](($memberCount -shr 8) -band 0xFF))
        }
        foreach ($member in $members) {
            $phraseBytes = Latin1String-To-Bytes $member.Text
            $carrier.Add([byte]$phraseBytes.Length)
            foreach ($pb in $phraseBytes) { $carrier.Add($pb) }
        }
    }

    # 32-bit little-endian body length, then the body itself.
    $size = $payload.Length
    foreach ($shift in 0, 8, 16, 24) {
        $carrier.Add([byte](($size -shr $shift) -band 0xFF))
    }
    foreach ($pb in $payload) { $carrier.Add($pb) }

    return $carrier.ToArray()
}

# ----------------------------------------------------------------------
# Serialize a tier dictionary alone (no body) - used for the installed
# persistent dictionary file. Magic "OCC4D".
# ----------------------------------------------------------------------
# Layout: "OCC4D", tier-1 count (1 byte) + phrases, tier-2 count (2 bytes
# LE) + phrases, tier-3 count (2 bytes LE) + phrases. Each phrase is a
# length byte followed by its Latin-1 bytes. Returns: byte[]
function Serialize-Dictionary {
    param([object[]]$tierDict)

    $blob = New-Object System.Collections.Generic.List[byte]
    foreach ($magicByte in [System.Text.Encoding]::ASCII.GetBytes('OCC4D')) { $blob.Add($magicByte) }

    foreach ($tierNumber in 1, 2, 3) {
        $members = @($tierDict | Where-Object { $_.Tier -eq $tierNumber })
        $memberCount = $members.Count
        if ($tierNumber -eq 1) {
            $blob.Add([byte]$memberCount)
        }
        else {
            $blob.Add([byte]($memberCount -band 0xFF))
            $blob.Add([byte](($memberCount -shr 8) -band 0xFF))
        }
        foreach ($member in $members) {
            $phraseBytes = Latin1String-To-Bytes $member.Text
            $blob.Add([byte]$phraseBytes.Length)
            foreach ($pb in $phraseBytes) { $blob.Add($pb) }
        }
    }
    return $blob.ToArray()
}

# Build a body-only packet that references an already-installed dictionary.
# Magic "OCC4P". Payload = 4-byte body length + body bytes.
function Build-Packet-BodyOnly {
    param([byte[]]$inputBytes, [object[]]$tierDict)

    $payload = Encode-Body-WithTiers -inputBytes $inputBytes -tierDict $tierDict

    $packet = New-Object System.Collections.Generic.List[byte]
    foreach ($magicByte in [System.Text.Encoding]::ASCII.GetBytes('OCC4P')) { $packet.Add($magicByte) }

    # 32-bit little-endian body length
    $size = $payload.Length
    foreach ($shift in 0, 8, 16, 24) {
        $packet.Add([byte](($size -shr $shift) -band 0xFF))
    }
    foreach ($pb in $payload) { $packet.Add($pb) }

    return $packet.ToArray()
}

# ----------------------------------------------------------------------
# Parse a tier dictionary from its serialized form.
# Returns: list of @{ Text; Tier; TokenBytes }
# ----------------------------------------------------------------------
# Expects the "OCC4D" layout: magic, tier-1 count (1 byte) + phrases,
# tier-2 count (2 bytes LE) + phrases, tier-3 count (2 bytes LE) + phrases,
# each phrase being a length byte followed by Latin-1 bytes.
# Throws "dict too short" / "bad dict magic" on a bad header and
# "dict truncated" when a count, length byte or phrase runs past the end
# of the buffer. Out-of-range array reads evaluate to $null in PowerShell,
# so without these checks a cut-off dictionary parses "successfully".
function Parse-Dictionary([byte[]]$dictBytes) {
    if ($null -eq $dictBytes -or $dictBytes.Length -lt 5) { throw "dict too short" }
    if ([System.Text.Encoding]::ASCII.GetString($dictBytes, 0, 5) -ne 'OCC4D') { throw "bad dict magic" }

    $latin1  = [System.Text.Encoding]::GetEncoding(28591)
    $total   = $dictBytes.Length
    $i       = 5
    $entries = New-Object 'System.Collections.Generic.List[object]'

    foreach ($tier in 1, 2, 3) {
        # tier 1 stores its count in one byte, tiers 2 and 3 in two (LE)
        $countWidth = if ($tier -eq 1) { 1 } else { 2 }
        if (($i + $countWidth) -gt $total) { throw "dict truncated" }
        $count = [int]$dictBytes[$i]
        if ($countWidth -eq 2) { $count = $count -bor ([int]$dictBytes[$i + 1] -shl 8) }
        $i += $countWidth

        for ($k = 0; $k -lt $count; $k++) {
            if ($i -ge $total) { throw "dict truncated" }
            $len = [int]$dictBytes[$i]; $i++
            if (($i + $len) -gt $total) { throw "dict truncated" }
            $text = $latin1.GetString($dictBytes, $i, $len)
            $i += $len

            # Token bytes follow from the entry's position within its tier.
            if ($tier -eq 1) {
                $tb = @([byte]$TIER1_BYTES[$k])
            }
            elseif ($tier -eq 2) {
                $tb = if ($k -lt 256) { @([byte]$TIER2_ESC_A, [byte]$k) } else { @([byte]$TIER2_ESC_B, [byte]($k - 256)) }
            }
            else {
                $lo = [byte]($k -band 0xFF); $hi = [byte](($k -shr 8) -band 0xFF)
                $tb = @([byte]$TIER3_ESC, $lo, $hi)
            }
            $entries.Add(@{ Text = $text; Tier = $tier; TokenBytes = $tb })
        }
    }
    # Leading comma keeps the result an array even for 0 or 1 entries.
    return ,$entries.ToArray()
}

# ----------------------------------------------------------------------
# Decode a body buffer using a parsed dictionary.
# ----------------------------------------------------------------------
# Parameters:
#   body  encoded body bytes (tokens, escapes and literals)
#   dict  entries carrying at least Text and Tier; the position of an entry
#         within its tier is its token index
# Returns: the decoded bytes.
# A tier-2/3 token whose index has no dictionary entry decodes to nothing,
# and a tier-1 byte without an entry is copied through as a literal (both
# unchanged from earlier versions). An escape byte that is cut off by the
# end of the body throws instead of reading past the buffer.
function Decode-Body {
    param([byte[]]$body, [object[]]$dict)

    # Convert every phrase to bytes once, grouped by tier.
    $tier1 = New-Object 'System.Collections.Generic.List[object]'
    $tier2 = New-Object 'System.Collections.Generic.List[object]'
    $tier3 = New-Object 'System.Collections.Generic.List[object]'
    foreach ($e in $dict) {
        if     ($e.Tier -eq 1) { $tier1.Add([byte[]]@(Latin1String-To-Bytes $e.Text)) }
        elseif ($e.Tier -eq 2) { $tier2.Add([byte[]]@(Latin1String-To-Bytes $e.Text)) }
        elseif ($e.Tier -eq 3) { $tier3.Add([byte[]]@(Latin1String-To-Bytes $e.Text)) }
    }

    # byte value -> tier-1 index, or -1 when the byte is not a tier-1 token.
    # Filled back-to-front so the first listing of a value wins.
    $tier1Slot = New-Object int[] 256
    for ($s = 0; $s -lt 256; $s++) { $tier1Slot[$s] = -1 }
    for ($t = $TIER1_BYTES.Count - 1; $t -ge 0; $t--) { $tier1Slot[[int]$TIER1_BYTES[$t]] = $t }

    $out = New-Object System.Collections.Generic.List[byte]
    $j   = 0
    $end = $body.Length
    while ($j -lt $end) {
        $b = $body[$j]

        $slot = $tier1Slot[[int]$b]
        if ($slot -ge 0 -and $slot -lt $tier1.Count) {
            foreach ($x in $tier1[$slot]) { $out.Add($x) }
            $j++
            continue
        }

        if ($b -eq $TIER2_ESC_A -or $b -eq $TIER2_ESC_B) {
            if (($j + 1) -ge $end) { throw "truncated tier-2 token" }
            $idx = [int]$body[$j + 1]
            if ($b -eq $TIER2_ESC_B) { $idx += 256 }
            if ($idx -lt $tier2.Count) { foreach ($x in $tier2[$idx]) { $out.Add($x) } }
            $j += 2
            continue
        }

        if ($b -eq $TIER3_ESC) {
            if (($j + 2) -ge $end) { throw "truncated tier-3 token" }
            $idx = [int]$body[$j + 1] -bor ([int]$body[$j + 2] -shl 8)
            if ($idx -lt $tier3.Count) { foreach ($x in $tier3[$idx]) { $out.Add($x) } }
            $j += 3
            continue
        }

        if ($b -eq $LITERAL_ESC) {
            if (($j + 1) -ge $end) { throw "truncated literal escape" }
            $out.Add($body[$j + 1])
            $j += 2
            continue
        }

        # plain literal byte
        $out.Add($b)
        $j++
    }
    return $out.ToArray()
}

# Decode a self-contained "OCC4" carrier.
# Reads the embedded dictionary (tier 1 count is one byte, tiers 2 and 3
# use two little-endian bytes), then the 4-byte little-endian body length,
# and hands the body to Decode-Body. Throws "bad magic" when the buffer
# does not start with "OCC4".
function Decode-Carrier-FullDict([byte[]]$enc) {
    $magic = [System.Text.Encoding]::ASCII.GetString($enc, 0, 4)
    if ($magic -ne 'OCC4') { throw "bad magic" }

    $latin1  = [System.Text.Encoding]::GetEncoding(28591)
    $cursor  = 4
    $phrases = New-Object 'System.Collections.Generic.List[object]'

    foreach ($tierNumber in 1, 2, 3) {
        if ($tierNumber -eq 1) {
            $howMany = $enc[$cursor]
            $cursor += 1
        }
        else {
            $howMany = [int]$enc[$cursor] -bor ([int]$enc[$cursor + 1] -shl 8)
            $cursor += 2
        }
        for ($n = 0; $n -lt $howMany; $n++) {
            $phraseLen = $enc[$cursor]
            $cursor++
            $phrase = $latin1.GetString($enc, $cursor, $phraseLen)
            $phrases.Add(@{ Text = $phrase; Tier = $tierNumber })
            $cursor += $phraseLen
        }
    }

    # body length: 32-bit little-endian
    $bodyLen = 0
    foreach ($k in 0, 1, 2, 3) {
        $bodyLen = $bodyLen -bor ([int]$enc[$cursor + $k] -shl (8 * $k))
    }
    $cursor += 4

    $payload = New-Object byte[] $bodyLen
    [Array]::Copy($enc, $cursor, $payload, 0, $bodyLen)
    return (Decode-Body -body $payload -dict $phrases)
}

# Decode an "OCC4P" body-only packet using an external dict.
function Decode-Packet([byte[]]$pkt, [object[]]$dict) {
    $magic = [System.Text.Encoding]::ASCII.GetString($pkt, 0, 5)
    if ($magic -ne 'OCC4P') { throw "bad pkt magic" }

    # body length: 32-bit little-endian, right after the magic
    $cursor  = 5
    $bodyLen = 0
    foreach ($k in 0, 1, 2, 3) {
        $bodyLen = $bodyLen -bor ([int]$pkt[$cursor + $k] -shl (8 * $k))
    }
    $cursor += 4

    $payload = New-Object byte[] $bodyLen
    [Array]::Copy($pkt, $cursor, $payload, 0, $bodyLen)
    return (Decode-Body -body $payload -dict $dict)
}

# ----------------------------------------------------------------------
# Delta encoder/decoder
# ----------------------------------------------------------------------
# Computes longest common prefix and longest common suffix between two
# byte arrays.
# Encodes only the middle (changed section) as an OCC4P body-only packet
# inside a wrapper:
#
#   magic      "OCC4X" (5 bytes)
#   prev_len   4 bytes LE
#   prefix     4 bytes LE  (length of common prefix)
#   suffix     4 bytes LE  (length of common suffix)
#   mid_len    4 bytes LE  (length of the OCC4P packet holding middle bytes)
#   mid_bytes  mid_len bytes (OCC4P packet, encoded with the installed dictionary)
#
# Receiver: take prev[0..prefix-1] + decode(mid_bytes) + prev[len-suffix..end]
function Build-Delta {
    param([byte[]]$prev, [byte[]]$next, [object[]]$tierDict)
    $plen = $prev.Length; $nlen = $next.Length

    # Longest common prefix.
    $maxPrefix = [Math]::Min($plen, $nlen)
    $prefix = 0
    while ($prefix -lt $maxPrefix -and $prev[$prefix] -eq $next[$prefix]) { $prefix++ }

    # Longest common suffix that does not overlap the prefix in either array.
    $maxSuffix = [Math]::Min($plen - $prefix, $nlen - $prefix)
    $suffix = 0
    while ($suffix -lt $maxSuffix -and $prev[$plen - 1 - $suffix] -eq $next[$nlen - 1 - $suffix]) { $suffix++ }

    $midLen = $nlen - $prefix - $suffix
    $mid = New-Object byte[] $midLen
    if ($midLen -gt 0) { [Array]::Copy($next, $prefix, $mid, 0, $midLen) }

    # OCC-encode the middle using the installed dictionary
    $midCarrier = Build-Packet-BodyOnly -inputBytes $mid -tierDict $tierDict

    $out = New-Object System.Collections.Generic.List[byte]
    foreach ($c in [byte[]]([System.Text.Encoding]::ASCII.GetBytes('OCC4X'))) { [void]$out.Add($c) }
    foreach ($v in @($plen, $prefix, $suffix, $midCarrier.Length)) {
        [void]$out.Add([byte]($v -band 0xFF))
        [void]$out.Add([byte](($v -shr 8) -band 0xFF))
        [void]$out.Add([byte](($v -shr 16) -band 0xFF))
        [void]$out.Add([byte](($v -shr 24) -band 0xFF))
    }
    foreach ($x in $midCarrier) { [void]$out.Add($x) }
    return $out.ToArray()
}

# Rebuilds the new version from the receiver's previous copy and a delta
# produced by Build-Delta.
# Throws when the magic is wrong, when the delta is shorter than its own
# header or declared middle, when the delta was built against a base of a
# different length than $prev, or when prefix/suffix do not fit into $prev.
# Applying a delta to the wrong base would otherwise return wrong bytes
# without any error.
function Apply-Delta {
    param([byte[]]$prev, [byte[]]$delta, [object[]]$tierDict)
    if ($null -eq $prev) { $prev = [byte[]]@() }   # an empty previous version arrives as $null
    if ($null -eq $delta -or $delta.Length -lt 21) { throw "delta too short" }
    if ([System.Text.Encoding]::ASCII.GetString($delta, 0, 5) -ne 'OCC4X') { throw "bad delta magic" }

    # Four 32-bit little-endian header fields follow the magic.
    $fields = New-Object int[] 4
    $i = 5
    for ($f = 0; $f -lt 4; $f++) {
        $fields[$f] = [int]$delta[$i] -bor ([int]$delta[$i+1] -shl 8) -bor ([int]$delta[$i+2] -shl 16) -bor ([int]$delta[$i+3] -shl 24)
        $i += 4
    }
    $plen = $fields[0]; $prefix = $fields[1]; $suffix = $fields[2]; $midLen = $fields[3]

    if ($plen -ne $prev.Length) { throw "delta base length mismatch" }
    if ($prefix -lt 0 -or $suffix -lt 0 -or ($prefix + $suffix) -gt $prev.Length) { throw "bad delta bounds" }
    if ($midLen -lt 0 -or ($i + $midLen) -gt $delta.Length) { throw "delta truncated" }

    $midCarrier = New-Object byte[] $midLen
    [Array]::Copy($delta, $i, $midCarrier, 0, $midLen)
    $mid = Decode-Packet -pkt $midCarrier -dict $tierDict

    $out = New-Object System.Collections.Generic.List[byte]
    for ($k = 0; $k -lt $prefix; $k++) { [void]$out.Add($prev[$k]) }
    foreach ($x in $mid) { [void]$out.Add($x) }   # $mid is $null (no iterations) for an empty middle
    $startSuffix = $prev.Length - $suffix
    for ($k = $startSuffix; $k -lt $prev.Length; $k++) { [void]$out.Add($prev[$k]) }
    return $out.ToArray()
}

# ----------------------------------------------------------------------
# Mode A: per-file standalone compression for a file set.
# Sums raw bytes, gzip(raw) bytes, OCC V4 bytes across all files.
# Validates round-trip per file.
#
# Parameters:
#   setName     label reported for the set
#   files       files to encode, each with its own dictionary
#   phraseLens  candidate phrase lengths passed to Build-Dictionary
# Returns: one summary object (totals, winner, reduction percentages and
#          PASS/FAIL for the SHA-256 round-trip of every file).
# ----------------------------------------------------------------------
function Run-Mode-A {
    param([string]$setName, [System.IO.FileInfo[]]$files, [int[]]$phraseLens)
    Write-Host ("Mode A on $setName ($($files.Count) files)...")
    $rawTotal = 0; $gzipTotal = 0; $occTotal = 0
    $allPass = $true
    $i = 0
    foreach ($f in $files) {
        $i++
        $raw = [System.IO.File]::ReadAllBytes($f.FullName)
        $rawSha = Get-Sha256Hex $raw
        $gz = Gzip-Bytes $raw

        # Build a per-file dictionary
        $ranked  = Build-Dictionary -inputBytes $raw -phraseLengths $phraseLens -maxEntries 256
        $tier    = Assign-Tiers -rankedEntries $ranked -maxTier1 16 -maxTier2 240 -maxTier3 0
        $carrier = Build-Carrier-FullDict -inputBytes $raw -tierDict $tier

        $pass = $false
        try {
            # A zero-length reconstruction comes back as $null; @() turns it
            # into an empty array so an empty file can still hash-match.
            $rec = [byte[]]@(Decode-Carrier-FullDict $carrier)
            $pass = ((Get-Sha256Hex $rec) -eq $rawSha)
        } catch { $pass = $false }
        if (-not $pass) { $allPass = $false }

        $rawTotal  += $raw.Length
        $gzipTotal += $gz.Length
        $occTotal  += $carrier.Length
        if ($i % 25 -eq 0 -or $i -eq $files.Count) {
            Write-Host (" [{0}/{1}] running totals raw={2} gz={3} occ={4}" -f $i, $files.Count, $rawTotal, $gzipTotal, $occTotal)
        }
    }

    # Ties go to gzip: OCC is only reported as the winner when strictly smaller.
    $winner = if ($gzipTotal -le $occTotal) { 'Gzip(raw)' } else { 'OCC V4' }
    $msg = if ($winner -eq 'Gzip(raw)') { 'Gzip(raw) still wins for this set.' } else { 'OCC V4 wins for this set (cumulative bytes).' }
    return [pscustomobject]@{
        file_set              = $setName
        file_count            = $files.Count
        raw_bytes             = $rawTotal
        gzip_raw_bytes        = $gzipTotal
        occ_v4_bytes          = $occTotal
        winner                = $winner
        winner_message        = $msg
        reconstruction_status = if ($allPass) { 'PASS' } else { 'FAIL' }
        occ_reduction_pct     = if ($rawTotal -gt 0) { [math]::Round((1.0 - ($occTotal / [double]$rawTotal)) * 100.0, 2) } else { 0 }
        gzip_reduction_pct    = if ($rawTotal -gt 0) { [math]::Round((1.0 - ($gzipTotal / [double]$rawTotal)) * 100.0, 2) } else { 0 }
    }
}

# ----------------------------------------------------------------------
# Mode B: persistent shared dictionary.
# Uses the first N files as the training set; builds one shared dictionary;
# encodes every file of the set (training files included) as a body-only
# packet.
# Wire bytes (sender side) = initial install package size + sum of packet sizes.
# Cloud baseline = sum of raw file bytes.
# ----------------------------------------------------------------------
# Parameters:
#   setName     label reported for the set
#   files       all files of the set, in order
#   trainCount  number of leading files used to train the shared dictionary
#   phraseLens  candidate phrase lengths passed to Build-Dictionary
# Returns: one summary object (install size, packet totals, bytes saved and
#          PASS/FAIL for the SHA-256 round-trip of every file).
function Run-Mode-B {
    param(
        [string]$setName,
        [System.IO.FileInfo[]]$files,
        [int]$trainCount,
        [int[]]$phraseLens
    )
    Write-Host ("Mode B on $setName (train on first $trainCount of $($files.Count))...")

    # Training set. The null filter keeps a missing/empty set from putting
    # a $null "file" into the training list.
    $trainFiles = @($files | Where-Object { $null -ne $_ } | Select-Object -First $trainCount)
    $trainBytes = New-Object System.Collections.Generic.List[byte]
    foreach ($f in $trainFiles) {
        $chunk = [System.IO.File]::ReadAllBytes($f.FullName)
        $trainBytes.AddRange([byte[]]$chunk)
    }
    $trainArr = $trainBytes.ToArray()

    $ranked = Build-Dictionary -inputBytes $trainArr -phraseLengths $phraseLens -maxEntries 512
    $tier   = Assign-Tiers -rankedEntries $ranked -maxTier1 16 -maxTier2 480 -maxTier3 0

    # Serialize the dictionary alone for "installation"
    $dictBytes    = Serialize-Dictionary -tierDict $tier
    $installBytes = $dictBytes.Length

    $allPass = $true

    # The receiver parses the installed dictionary once; it does not change
    # between packets. If it cannot be parsed, no file can round-trip.
    $installedDict = @()
    $dictUsable = $true
    try { $installedDict = Parse-Dictionary $dictBytes } catch { $dictUsable = $false; $allPass = $false }

    $packetTotal = 0
    $rawAll = 0
    $i = 0
    foreach ($f in $files) {
        $i++
        $raw = [System.IO.File]::ReadAllBytes($f.FullName)
        $rawAll += $raw.Length
        $pkt = Build-Packet-BodyOnly -inputBytes $raw -tierDict $tier
        $packetTotal += $pkt.Length
        if ($dictUsable) {
            try {
                # @() turns the $null of a zero-length reconstruction into an empty array
                $rec = [byte[]]@(Decode-Packet -pkt $pkt -dict $installedDict)
                if ((Get-Sha256Hex $rec) -ne (Get-Sha256Hex $raw)) { $allPass = $false }
            } catch { $allPass = $false }
        }
        if ($i % 25 -eq 0 -or $i -eq $files.Count) {
            Write-Host (" [{0}/{1}] running totals raw={2} packets={3}" -f $i, $files.Count, $rawAll, $packetTotal)
        }
    }

    # For comparability we report cumulative wire bytes for the persistent
    # model = installBytes + sum(packet bytes).
    $persistentTotal = $installBytes + $packetTotal
    $saved = $rawAll - $persistentTotal
    $pct = if ($rawAll -gt 0) { [math]::Round(($saved / [double]$rawAll) * 100.0, 2) } else { 0 }
    return [pscustomobject]@{
        file_set               = $setName
        file_count             = $files.Count
        train_file_count       = $trainFiles.Count
        install_bytes          = $installBytes
        raw_all_bytes          = $rawAll
        persistent_total_bytes = $persistentTotal
        packets_only_bytes     = $packetTotal
        bytes_saved            = $saved
        percent_saved          = $pct
        reconstruction_status  = if ($allPass) { 'PASS' } else { 'FAIL' }
    }
}

# ----------------------------------------------------------------------
# Mode C: delta updates between consecutive versions in a set.
# Cloud baseline = sum of raw bytes for files 2..N (full new-version downloads).
# Delta sync = sum of delta packet sizes for files 2..N.
#
# Parameters:
#   setName     label reported for the set
#   files       versions in order; fewer than two yields a SKIPPED row
#   phraseLens  candidate phrase lengths passed to Build-Dictionary
# Returns: one summary object; the SKIPPED row carries the same properties
#          as a normal row so the JSON schema stays uniform.
# ----------------------------------------------------------------------
function Run-Mode-C {
    param([string]$setName, [System.IO.FileInfo[]]$files, [int[]]$phraseLens)
    Write-Host ("Mode C on $setName ($($files.Count) versions in order)...")
    if ($files.Count -lt 2) {
        return [pscustomobject]@{
            file_set              = 'n/a (need >=2 files)'
            version_count         = $files.Count
            full_download_bytes   = 0
            delta_sync_bytes      = 0
            install_dict_bytes    = 0
            bytes_saved           = 0
            percent_saved         = 0
            reconstruction_status = 'SKIPPED'
        }
    }

    # Build a small dictionary from the first 2-3 files
    $trainBytes = New-Object System.Collections.Generic.List[byte]
    $trainTake = [Math]::Min(3, $files.Count)
    for ($k = 0; $k -lt $trainTake; $k++) {
        $chunk = [System.IO.File]::ReadAllBytes($files[$k].FullName)
        $trainBytes.AddRange([byte[]]$chunk)
    }
    $ranked    = Build-Dictionary -inputBytes $trainBytes.ToArray() -phraseLengths $phraseLens -maxEntries 256
    $tier      = Assign-Tiers -rankedEntries $ranked -maxTier1 16 -maxTier2 240 -maxTier3 0
    $dictBytes = Serialize-Dictionary -tierDict $tier

    $allPass = $true

    # Receiver-side dictionary, parsed once for all deltas.
    $installedDict = @()
    $dictUsable = $true
    try { $installedDict = Parse-Dictionary $dictBytes } catch { $dictUsable = $false; $allPass = $false }

    $prevBytes  = [System.IO.File]::ReadAllBytes($files[0].FullName)
    $fullTotal  = 0
    $deltaTotal = 0
    for ($i = 1; $i -lt $files.Count; $i++) {
        $next = [System.IO.File]::ReadAllBytes($files[$i].FullName)
        $fullTotal += $next.Length
        $delta = Build-Delta -prev $prevBytes -next $next -tierDict $tier
        $deltaTotal += $delta.Length
        if ($dictUsable) {
            try {
                # @() turns the $null of a zero-length reconstruction into an empty array
                $rec = [byte[]]@(Apply-Delta -prev $prevBytes -delta $delta -tierDict $installedDict)
                if ((Get-Sha256Hex $rec) -ne (Get-Sha256Hex $next)) { $allPass = $false }
            } catch { $allPass = $false }
        }
        $prevBytes = $next
    }

    $saved = $fullTotal - $deltaTotal
    $pct = if ($fullTotal -gt 0) { [math]::Round(($saved / [double]$fullTotal) * 100.0, 2) } else { 0 }
    return [pscustomobject]@{
        file_set              = $setName
        version_count         = $files.Count
        full_download_bytes   = $fullTotal
        delta_sync_bytes      = $deltaTotal
        install_dict_bytes    = $dictBytes.Length
        bytes_saved           = $saved
        percent_saved         = $pct
        reconstruction_status = if ($allPass) { 'PASS' } else { 'FAIL' }
    }
}

# ----------------------------------------------------------------------
# Main
# ----------------------------------------------------------------------
Write-Output 'OneCharacterCode V4 benchmark starting'
# One string: passing two arguments to Write-Output prints them on two lines.
Write-Output ('Inputs root: ' + $inputsRoot)
Write-Output ''

# Same phrase pool as V3
$shortPhrase = @(3,4,5,6,8,10,12,16,24,32,48,64,96,128)
# For very large concatenated training sets we use a faster, shorter pool
$fastPhrase = @(4,6,8,12,16,24,32,48,64)

# Resolve sets: the files directly inside inputs\<name>, sorted by name.
# A missing folder yields no files.
function Get-Set([string]$name) {
    $d = Join-Path $inputsRoot $name
    if (-not (Test-Path $d)) { return @() }
    return @(Get-ChildItem -Path $d -File | Sort-Object Name)
}

$smallSet   = Get-Set 'SMALL_CURRENT_SET'
$sessionSet = Get-Set 'REPEATED_APP_SESSIONS'
$pageSet    = Get-Set 'WEBSITE_PAGE_SET'
$logSet     = Get-Set 'LOG_STYLE_SET'
$agentSet   = Get-Set 'AGENT_STATE_SET'

Write-Output (" SMALL_CURRENT_SET: {0} files" -f $smallSet.Count)
Write-Output (" REPEATED_APP_SESSIONS: {0} files" -f $sessionSet.Count)
Write-Output (" WEBSITE_PAGE_SET: {0} files" -f $pageSet.Count)
Write-Output (" LOG_STYLE_SET: {0} files" -f $logSet.Count)
Write-Output (" AGENT_STATE_SET: {0} files" -f $agentSet.Count)
Write-Output ''
# ----------------------------------------------------------------------
# MODE A - Standalone per-file compression (cumulative)
# ----------------------------------------------------------------------
$rule = '======================================================='

Write-Output $rule
Write-Output 'MODE A - Standalone per-file compression'
Write-Output $rule
$modeA = @()
foreach ($job in @(
        @{ Name = 'SMALL_CURRENT_SET';     Files = $smallSet },
        @{ Name = 'REPEATED_APP_SESSIONS'; Files = $sessionSet },
        @{ Name = 'WEBSITE_PAGE_SET';      Files = $pageSet },
        @{ Name = 'AGENT_STATE_SET';       Files = $agentSet })) {
    $modeA += (Run-Mode-A -setName $job.Name -files $job.Files -phraseLens $shortPhrase)
}
# Log set per-file would be one very large file - skip pinpoint per-file table for it.
Write-Output ''

# ----------------------------------------------------------------------
# MODE B - Persistent shared dictionary (install once, ship bodies)
# ----------------------------------------------------------------------
Write-Output $rule
Write-Output 'MODE B - Persistent shared dictionary'
Write-Output $rule
$modeB = @()
foreach ($job in @(
        @{ Name = 'REPEATED_APP_SESSIONS'; Files = $sessionSet; Train = 10 },
        @{ Name = 'WEBSITE_PAGE_SET';      Files = $pageSet;    Train = 5 },
        @{ Name = 'AGENT_STATE_SET';       Files = $agentSet;   Train = 10 })) {
    $modeB += (Run-Mode-B -setName $job.Name -files $job.Files -trainCount $job.Train -phraseLens $shortPhrase)
}
Write-Output ''

# ----------------------------------------------------------------------
# MODE C - Delta updates between consecutive versions
# ----------------------------------------------------------------------
Write-Output $rule
Write-Output 'MODE C - Delta updates'
Write-Output $rule
$modeC = @()
foreach ($job in @(
        @{ Name = 'REPEATED_APP_SESSIONS'; Files = $sessionSet },
        @{ Name = 'WEBSITE_PAGE_SET';      Files = $pageSet },
        @{ Name = 'AGENT_STATE_SET';       Files = $agentSet })) {
    $modeC += (Run-Mode-C -setName $job.Name -files $job.Files -phraseLens $shortPhrase)
}
Write-Output ''

# ----------------------------------------------------------------------
# Write output JSON + text reports
# ----------------------------------------------------------------------
$now     = (Get-Date).ToString('o')
$machine = $env:COMPUTERNAME
$psv     = $PSVersionTable.PSVersion.ToString()
$encUtf8NoBom = New-Object System.Text.UTF8Encoding $false

# Serializes one result package next to the script as UTF-8 without BOM.
$saveJson = {
    param($fileName, $package)
    $target = Join-Path $scriptDir $fileName
    $json = $package | ConvertTo-Json -Depth 10
    [System.IO.File]::WriteAllText($target, $json, $encUtf8NoBom)
}

# benchmark-results-v4.json (Mode A)
$pkgA = [pscustomobject]@{
    schema_version = 'occ-v4-standalone'
    generated_at   = $now
    machine_name   = $machine
    ps_version     = $psv
    description    = 'V4 Mode A: standalone per-file compression. Cumulative bytes per file set: raw, gzip(raw), OCC V4. Round-trip required for any win.'
    results        = $modeA
}
& $saveJson 'benchmark-results-v4.json' $pkgA

# system-persistent-results-v4.json (Mode B)
$pkgB = [pscustomobject]@{
    schema_version = 'occ-v4-persistent-dict'
    generated_at   = $now
    machine_name   = $machine
    ps_version     = $psv
    description    = 'V4 Mode B: persistent shared dictionary. The first N files are training. The dictionary is installed once. Subsequent files are body-only packets. Cumulative wire bytes = install + packets.'
    disclosure     = 'This is system-level data savings, not file compression. After install, the device reuses the dictionary across all later transfers.'
    results        = $modeB
}
& $saveJson 'system-persistent-results-v4.json' $pkgB

# delta-update-results-v4.json (Mode C)
$pkgC = [pscustomobject]@{
    schema_version = 'occ-v4-delta-update'
    generated_at   = $now
    machine_name   = $machine
    ps_version     = $psv
    description    = 'V4 Mode C: delta updates between consecutive versions. The receiver already has the previous version; only changed-middle bytes are sent inside an OCC carrier. Cumulative wire bytes for versions 2..N.'
    disclosure     = 'This is delta-sync system bandwidth, not file compression. The receiver must have the previous version locally.'
    results        = $modeC
}
& $saveJson 'delta-update-results-v4.json' $pkgC

# Text run report: one fixed-width table per mode.
$fmtA      = "{0,-26} {1,7} {2,12} {3,12} {4,12} {5,12} {6,11}"
$fmtHeadBC = "{0,-26} {1,7} {2,12} {3,12} {4,12} {5,8} {6,11}"
$fmtRowBC  = "{0,-26} {1,7} {2,12} {3,12} {4,12} {5,7}% {6,11}"

$report = New-Object System.Text.StringBuilder
$null = $report.AppendLine('OneCharacterCode V4 benchmark - run report')
$null = $report.AppendLine("Generated: $now")
$null = $report.AppendLine("Machine: $machine PowerShell: $psv")
$null = $report.AppendLine('')

$null = $report.AppendLine('MODE A - STANDALONE PER-FILE COMPRESSION')
$null = $report.AppendLine('----------------------------------------')
$null = $report.AppendLine(($fmtA -f 'Set','Files','RawBytes','GzipBytes','OCC V4','Winner','Recon'))
foreach ($row in $modeA) {
    $line = $fmtA -f $row.file_set, $row.file_count, $row.raw_bytes, $row.gzip_raw_bytes, $row.occ_v4_bytes, $row.winner, $row.reconstruction_status
    $null = $report.AppendLine($line)
}
$null = $report.AppendLine('')

$null = $report.AppendLine('MODE B - PERSISTENT SHARED DICTIONARY')
$null = $report.AppendLine('-------------------------------------')
$null = $report.AppendLine(($fmtHeadBC -f 'Set','Files','RawAllBytes','PersistTotal','BytesSaved','Saved%','Recon'))
foreach ($row in $modeB) {
    $line = $fmtRowBC -f $row.file_set, $row.file_count, $row.raw_all_bytes, $row.persistent_total_bytes, $row.bytes_saved, $row.percent_saved, $row.reconstruction_status
    $null = $report.AppendLine($line)
}
$null = $report.AppendLine('')

$null = $report.AppendLine('MODE C - DELTA UPDATES')
$null = $report.AppendLine('----------------------')
$null = $report.AppendLine(($fmtHeadBC -f 'Set','Versions','FullDownload','DeltaSync','BytesSaved','Saved%','Recon'))
foreach ($row in $modeC) {
    $line = $fmtRowBC -f $row.file_set, $row.version_count, $row.full_download_bytes, $row.delta_sync_bytes, $row.bytes_saved, $row.percent_saved, $row.reconstruction_status
    $null = $report.AppendLine($line)
}
$null = $report.AppendLine('')
$null = $report.AppendLine('End of report.')

[System.IO.File]::WriteAllText((Join-Path $scriptDir 'benchmark-test-run-v4.txt'), $report.ToString(), $encUtf8NoBom)

foreach ($written in 'benchmark-results-v4.json',
                      'system-persistent-results-v4.json',
                      'delta-update-results-v4.json',
                      'benchmark-test-run-v4.txt') {
    Write-Output "Wrote: $written"
}
Write-Output ''
Write-Output 'Done.'