# run_benchmark_v3.ps1
# OneCharacterCode V3 prototype benchmark - PowerShell 5.1 compatible.
#
# Two separate tests:
#   1. File compression with the V3 symbolic encoder (3-tier tokens,
#      iterative refinement, per-file adaptive mode).
#   2. System-level bandwidth simulation comparing cloud-full-download vs
#      local-install-plus-update-packet at session counts 1/5/10/30/100.
#
# These are two DIFFERENT claims. The script keeps them in separate files.

$ErrorActionPreference = 'Stop'

$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
$inputsDir = Join-Path $scriptDir 'inputs'
$outputsDir = Join-Path $scriptDir 'outputs'
if (-not (Test-Path $outputsDir)) { New-Item -ItemType Directory -Path $outputsDir | Out-Null }

# ----------------------------------------------------------------------
# Reserved-byte table.
# ----------------------------------------------------------------------
# Tier 1: 16 single-byte tokens, choosing bytes that avoid TAB/LF/CR/NUL
$TIER1_BYTES = @(0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,
                 0x0B,0x0C,
                 0x0E,0x0F,0x10,0x11,0x12,0x13)
# Tier 2: ESC 0x14 + index (0..255) -> 256, ESC 0x15 + index (0..255) -> 256
$TIER2_ESC_A = 0x14
$TIER2_ESC_B = 0x15
# Tier 3: ESC 0x16 + 2-byte LE index (0..65535)
$TIER3_ESC = 0x16
# Literal escape: ESC 0x17 + byte
$LITERAL_ESC = 0x17

# Bytes that must be escaped if they appear in the source bytestream:
$RESERVED = New-Object 'System.Collections.Generic.HashSet[byte]'
foreach ($b in $TIER1_BYTES) { [void]$RESERVED.Add([byte]$b) }
[void]$RESERVED.Add([byte]$TIER2_ESC_A)
[void]$RESERVED.Add([byte]$TIER2_ESC_B)
[void]$RESERVED.Add([byte]$TIER3_ESC)
[void]$RESERVED.Add([byte]$LITERAL_ESC)

# Kept for compatibility; not referenced by the code visible in this file.
# The encoder marks picked entries with UTF-16 private-use code points
# ($PLACEHOLDER_FIRST), which are never written to the output file.
$PLACEHOLDER_BASE = 0xF0

# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------

# Returns the SHA-256 digest of $bytes as a 64-character lowercase hex string.
function Get-Sha256Hex([byte[]]$bytes) {
    $sha = [System.Security.Cryptography.SHA256]::Create()
    try {
        $digest = $sha.ComputeHash($bytes)
        return [System.BitConverter]::ToString($digest).Replace('-', '').ToLowerInvariant()
    }
    finally {
        $sha.Dispose()
    }
}

# Compresses $bytes with GZipStream and returns the gzip stream as a byte[].
function Gzip-Bytes([byte[]]$bytes) {
    $ms = New-Object System.IO.MemoryStream
    try {
        $gz = New-Object System.IO.Compression.GZipStream($ms, [System.IO.Compression.CompressionMode]::Compress)
        try {
            $gz.Write($bytes, 0, $bytes.Length)
        }
        finally {
            # Disposing flushes the remaining compressed data into $ms;
            # doing it in finally releases the stream even if Write throws.
            $gz.Dispose()
        }
        # MemoryStream.ToArray still works after the stream is closed.
        # The unary comma keeps the result a byte[] instead of letting the
        # pipeline unroll it into an object[].
        return ,$ms.ToArray()
    }
    finally {
        $ms.Dispose()
    }
}

# Compresses $bytes with BrotliStream when that type can be loaded.
# Returns a byte[] on success, or $null when the type is unavailable or
# anything goes wrong (callers treat $null as "no Brotli result").
function Try-Brotli-Bytes([byte[]]$bytes) {
    try {
        $type = [System.Type]::GetType('System.IO.Compression.BrotliStream, System.IO.Compression.Brotli', $false)
        if ($null -eq $type) { return $null }
        $ms = New-Object System.IO.MemoryStream
        try {
            $br = $type::new($ms, [System.IO.Compression.CompressionMode]::Compress)
            try {
                $br.Write($bytes, 0, $bytes.Length)
            }
            finally {
                $br.Dispose()
            }
            return ,$ms.ToArray()
        }
        finally {
            $ms.Dispose()
        }
    }
    catch {
        return $null
    }
}

# ----------------------------------------------------------------------
# V3 ENCODER
# ----------------------------------------------------------------------
# Strategy:
#   Work on the input as a raw byte sequence mapped to a Latin-1 string for
#   fast substring counting. Byte values 0..255 map 1:1 to UTF-16 code
#   points 0..255 via the ISO-8859-1 codepage, so every byte becomes
#   exactly one .NET char and substring operations are cheap.
#   This avoids any UTF-8 multi-byte ambiguity.
#   Scan candidate substrings at multiple lengths, rank by net savings,
#   greedy accept with overlap rejection, RESCAN after each accept.
#   Final tier assignment by descending raw bytes saved (length * count).
# Converts raw bytes to a string in which char N represents byte N
# (ISO-8859-1 / Latin-1, codepage 28591). Null or empty input yields ''.
function Bytes-To-Latin1String([byte[]]$bytes) {
    if ($null -eq $bytes -or $bytes.Length -eq 0) { return '' }
    $enc = [System.Text.Encoding]::GetEncoding(28591)
    return $enc.GetString($bytes)
}

# Inverse of Bytes-To-Latin1String: one byte per char, returned as a byte[].
function Latin1String-To-Bytes([string]$s) {
    $enc = [System.Text.Encoding]::GetEncoding(28591)
    $raw = $enc.GetBytes($s)
    # Unary comma stops the pipeline from unrolling the array, so an empty
    # or single-byte result is still a byte[].
    return ,$raw
}

# Accepted dictionary entries are marked in the in-memory working string with
# UTF-16 code points from the private use area (0xE000 and up). A string
# decoded from bytes via Latin-1 only holds code points 0..255, so these
# markers cannot collide with input data. They are never written to a file.
$PLACEHOLDER_FIRST = 0xE000

# Counts non-overlapping ordinal occurrences of $needle in $haystack,
# scanning left to right. An empty needle counts as 0.
function Count-Occurrences([string]$haystack, [string]$needle) {
    if ([string]::IsNullOrEmpty($needle)) { return 0 }
    $hits = 0
    $from = 0
    $step = $needle.Length
    while ($true) {
        $at = $haystack.IndexOf($needle, $from, [System.StringComparison]::Ordinal)
        if ($at -lt 0) { break }
        $hits++
        $from = $at + $step
    }
    return $hits
}

# Replaces every non-overlapping ordinal occurrence of $needle in $haystack
# (left to right) with $replacement. An empty needle returns $haystack as is.
function Replace-All-NonOverlap([string]$haystack, [string]$needle, [string]$replacement) {
    if ([string]::IsNullOrEmpty($needle)) { return $haystack }
    $sb = New-Object System.Text.StringBuilder
    $from = 0
    $step = $needle.Length
    while ($true) {
        $at = $haystack.IndexOf($needle, $from, [System.StringComparison]::Ordinal)
        if ($at -lt 0) {
            [void]$sb.Append($haystack.Substring($from))
            break
        }
        if ($at -gt $from) {
            [void]$sb.Append($haystack.Substring($from, $at - $from))
        }
        [void]$sb.Append($replacement)
        $from = $at + $step
    }
    return $sb.ToString()
}

# Encodes $inputBytes into the OCC3 container.
#
# Parameters:
#   inputBytes     raw bytes to encode (null is treated as empty)
#   phraseLengths  candidate substring lengths; values below 3 or above 255
#                  are ignored (255 is the most a 1-byte length prefix holds)
#   maxTier1       number of 1-byte token slots (0..$TIER1_BYTES.Count)
#   maxTier2       number of 2-byte token slots (0..512)
#   maxTier3       number of 3-byte token slots (0..65535)
#
# Returns a hashtable with keys encodedBytes (byte[]), tier1Count,
# tier2Count, tier3Count and entryCount.
#
# Layout of encodedBytes:
#   'OCC3' | tier1 count (1 byte)    | entries
#          | tier2 count (2 bytes LE) | entries
#          | tier3 count (2 bytes LE) | entries
#          | body length (4 bytes LE) | body
#   Each entry is a 1-byte length followed by that many raw bytes.
#
# Throws if a tier limit is outside the range the token format can express.
function Encode-V3 {
    param(
        [byte[]]$inputBytes,
        [int[]]$phraseLengths,
        [int]$maxTier1,
        [int]$maxTier2,
        [int]$maxTier3
    )

    if ($maxTier1 -lt 0 -or $maxTier1 -gt $TIER1_BYTES.Count) { throw "maxTier1 must be between 0 and $($TIER1_BYTES.Count)" }
    if ($maxTier2 -lt 0 -or $maxTier2 -gt 512) { throw "maxTier2 must be between 0 and 512" }
    if ($maxTier3 -lt 0 -or $maxTier3 -gt 65535) { throw "maxTier3 must be between 0 and 65535" }

    $totalSlots = $maxTier1 + $maxTier2 + $maxTier3
    $working = Bytes-To-Latin1String $inputBytes

    # Ranking assumes every token costs 2 bytes (the mid tier); real tiers
    # are assigned after selection.
    $tentativeTokenCost = 2

    # One hashtable per accepted entry: Text, Length, Count, Slot.
    # Slot is the placeholder offset, i.e. the acceptance order.
    $entryInfo = New-Object 'System.Collections.Generic.List[object]'

    # ------------------------------------------------------------------
    # Greedy selection: each round picks the candidate with the highest
    # estimated net gain, substitutes it, and rescans.
    # ------------------------------------------------------------------
    while ($entryInfo.Count -lt $totalSlots) {
        if ($entryInfo.Count -gt 0x1FFF) { break }   # ran out of placeholder code points

        $bestText = $null
        $bestNet = 0

        foreach ($L in $phraseLengths) {
            if ($L -lt 3 -or $L -gt 255) { continue }
            if ($working.Length -lt ($L * 2)) { continue }

            $counts = New-Object 'System.Collections.Generic.Dictionary[string,int]'
            $upper = $working.Length - $L

            # $lastPh is the index of the right-most placeholder seen so
            # far; a window [i, i+L-1] contains a placeholder exactly when
            # $lastPh -ge $i. This skips the same windows as re-checking
            # every char of every window, in a single pass.
            $lastPh = -1
            for ($k = 0; $k -lt ($L - 1); $k++) {
                if ([int]$working[$k] -ge $PLACEHOLDER_FIRST) { $lastPh = $k }
            }
            for ($i = 0; $i -le $upper; $i++) {
                $tail = $i + $L - 1
                if ([int]$working[$tail] -ge $PLACEHOLDER_FIRST) { $lastPh = $tail }
                if ($lastPh -ge $i) { continue }

                $sub = $working.Substring($i, $L)
                if ($counts.ContainsKey($sub)) {
                    $counts[$sub] = $counts[$sub] + 1
                }
                else {
                    $counts[$sub] = 1
                }
            }

            foreach ($kv in $counts.GetEnumerator()) {
                if ($kv.Value -lt 2) { continue }
                # net = (bytes saved per use * uses) - dictionary cost,
                # where dictionary cost = 1 length byte + L entry bytes.
                # The sliding-window count includes overlapping matches,
                # so this is an estimate used only for ranking.
                $net = (($L - $tentativeTokenCost) * $kv.Value) - (1 + $L)
                if ($net -gt $bestNet) {
                    $bestNet = $net
                    $bestText = $kv.Key
                }
            }
        }

        if ($null -eq $bestText) { break }   # nothing with a positive net gain

        $slot = $entryInfo.Count
        $uses = Count-Occurrences $working $bestText
        $entryInfo.Add(@{
            Text   = $bestText
            Length = $bestText.Length
            Count  = $uses
            Slot   = $slot
        })
        $working = Replace-All-NonOverlap $working $bestText ([string][char]($PLACEHOLDER_FIRST + $slot))
    }

    # ------------------------------------------------------------------
    # Assign tiers: rank by raw bytes saved (length * count) descending so
    # the longest and most common entries get the cheapest tokens.
    # Keys are read with the indexer because a hashtable also has a real
    # Count property.
    # ------------------------------------------------------------------
    $rankedArr = @($entryInfo | Sort-Object -Descending { $_['Length'] * $_['Count'] })

    # Token bytes indexed by placeholder slot.
    $tokens = New-Object 'object[]' $entryInfo.Count

    for ($r = 0; $r -lt $rankedArr.Count; $r++) {
        $e = $rankedArr[$r]
        if ($r -lt $maxTier1) {
            $e['Tier'] = 1
            $e['TokenBytes'] = [byte[]]@([byte]$TIER1_BYTES[$r])
        }
        elseif ($r -lt ($maxTier1 + $maxTier2)) {
            $t2idx = $r - $maxTier1
            $e['Tier'] = 2
            if ($t2idx -lt 256) {
                $e['TokenBytes'] = [byte[]]@([byte]$TIER2_ESC_A, [byte]$t2idx)
            }
            else {
                $e['TokenBytes'] = [byte[]]@([byte]$TIER2_ESC_B, [byte]($t2idx - 256))
            }
        }
        else {
            # The selection loop never accepts more than $totalSlots
            # entries, so everything left over fits in tier 3.
            $t3idx = $r - $maxTier1 - $maxTier2
            $e['Tier'] = 3
            $e['TokenBytes'] = [byte[]]@([byte]$TIER3_ESC, [byte]($t3idx -band 0xFF), [byte](($t3idx -shr 8) -band 0xFF))
        }
        $tokens[$e['Slot']] = $e['TokenBytes']
    }

    # ------------------------------------------------------------------
    # Emit the body. $working already holds the input with every accepted
    # entry substituted in acceptance order, which is the order the Count
    # values above were measured in, so each entry is used exactly Count
    # times. Placeholders become tokens; reserved literal bytes are
    # prefixed with the literal escape.
    # ------------------------------------------------------------------
    $bodyOut = New-Object 'System.Collections.Generic.List[byte]'
    for ($i = 0; $i -lt $working.Length; $i++) {
        $code = [int]$working[$i]
        if ($code -ge $PLACEHOLDER_FIRST) {
            $bodyOut.AddRange([byte[]]$tokens[$code - $PLACEHOLDER_FIRST])
        }
        else {
            $lit = [byte]$code
            if ($RESERVED.Contains($lit)) { $bodyOut.Add([byte]$LITERAL_ESC) }
            $bodyOut.Add($lit)
        }
    }

    # ------------------------------------------------------------------
    # Dictionary header. Entries are stored raw (not escaped) because the
    # decoder reads them by length. Within a tier, entries stay in rank
    # order so an entry's position equals its token index.
    # ------------------------------------------------------------------
    $hdr = New-Object 'System.Collections.Generic.List[byte]'
    $hdr.AddRange([byte[]][System.Text.Encoding]::ASCII.GetBytes('OCC3'))

    $tierCounts = @{ 1 = 0; 2 = 0; 3 = 0 }
    foreach ($tierNo in 1, 2, 3) {
        $members = @($rankedArr | Where-Object { $_['Tier'] -eq $tierNo })
        $tierCounts[$tierNo] = $members.Count

        if ($tierNo -eq 1) {
            $hdr.Add([byte]$members.Count)                          # 1-byte count
        }
        else {
            $hdr.Add([byte]($members.Count -band 0xFF))             # 2-byte LE count
            $hdr.Add([byte](($members.Count -shr 8) -band 0xFF))
        }

        foreach ($e in $members) {
            $raw = Latin1String-To-Bytes $e['Text']
            if ($raw.Length -gt 255) { throw "Entry too long" }
            $hdr.Add([byte]$raw.Length)
            $hdr.AddRange([byte[]]$raw)
        }
    }

    # Body length (4 bytes LE), then the body itself.
    $bl = $bodyOut.Count
    $hdr.Add([byte]($bl -band 0xFF))
    $hdr.Add([byte](($bl -shr 8) -band 0xFF))
    $hdr.Add([byte](($bl -shr 16) -band 0xFF))
    $hdr.Add([byte](($bl -shr 24) -band 0xFF))
    $hdr.AddRange($bodyOut)

    return @{
        encodedBytes = $hdr.ToArray()
        tier1Count   = $tierCounts[1]
        tier2Count   = $tierCounts[2]
        tier3Count   = $tierCounts[3]
        entryCount   = $rankedArr.Count
    }
}

# ----------------------------------------------------------------------
# V3 DECODER
# ----------------------------------------------------------------------

# Decodes an OCC3 container produced by Encode-V3 and returns the original
# bytes as a byte[] (an empty byte[] for an empty body, never $null).
#
# Throws when the magic is wrong, when the header or body runs past the end
# of $enc, or when a tier-2 / tier-3 token names an entry that is not in
# the dictionary. A tier-1 byte with no dictionary entry is passed through
# as a literal.
function Decode-V3([byte[]]$enc) {
    if ($null -eq $enc -or $enc.Length -lt 4) { throw "encoded too short" }
    $magic = [System.Text.Encoding]::ASCII.GetString($enc, 0, 4)
    if ($magic -ne 'OCC3') { throw "bad magic: $magic" }
    $pos = 4

    $tier1 = New-Object 'System.Collections.Generic.List[byte[]]'
    $tier2 = New-Object 'System.Collections.Generic.List[byte[]]'
    $tier3 = New-Object 'System.Collections.Generic.List[byte[]]'

    # Tier 1 has a 1-byte entry count; tiers 2 and 3 use 2 bytes LE.
    $sections = @(
        @{ Table = $tier1; Width = 1 },
        @{ Table = $tier2; Width = 2 },
        @{ Table = $tier3; Width = 2 }
    )
    foreach ($section in $sections) {
        $width = $section['Width']
        if (($pos + $width) -gt $enc.Length) { throw "truncated dictionary header" }
        $entryCount = [int]$enc[$pos]
        if ($width -eq 2) { $entryCount = $entryCount -bor ([int]$enc[$pos + 1] -shl 8) }
        $pos += $width

        for ($k = 0; $k -lt $entryCount; $k++) {
            if ($pos -ge $enc.Length) { throw "truncated dictionary entry" }
            $len = [int]$enc[$pos]
            $pos++
            if (($pos + $len) -gt $enc.Length) { throw "truncated dictionary entry" }
            $entry = New-Object byte[] $len
            [Array]::Copy($enc, $pos, $entry, 0, $len)
            $section['Table'].Add($entry)
            $pos += $len
        }
    }

    if (($pos + 4) -gt $enc.Length) { throw "truncated body length" }
    $bl = [int]$enc[$pos] -bor ([int]$enc[$pos + 1] -shl 8) -bor ([int]$enc[$pos + 2] -shl 16) -bor ([int]$enc[$pos + 3] -shl 24)
    $pos += 4
    $bodyEnd = $pos + $bl
    if ($bl -lt 0 -or $bodyEnd -gt $enc.Length) { throw "truncated body" }

    # Byte value -> tier-1 index, or -1 when the byte is not a tier-1 token.
    $t1Slot = New-Object int[] 256
    for ($q = 0; $q -lt 256; $q++) { $t1Slot[$q] = -1 }
    for ($t = 0; $t -lt $TIER1_BYTES.Count; $t++) { $t1Slot[[int]$TIER1_BYTES[$t]] = $t }

    $out = New-Object 'System.Collections.Generic.List[byte]'
    $j = $pos
    while ($j -lt $bodyEnd) {
        $b = $enc[$j]

        $slot = $t1Slot[[int]$b]
        if ($slot -ge 0 -and $slot -lt $tier1.Count) {
            $out.AddRange([byte[]]$tier1[$slot])
            $j++
            continue
        }

        if ($b -eq $TIER2_ESC_A -or $b -eq $TIER2_ESC_B) {
            if (($j + 1) -ge $bodyEnd) { throw "truncated tier-2 token" }
            $idx = [int]$enc[$j + 1]
            if ($b -eq $TIER2_ESC_B) { $idx += 256 }
            if ($idx -ge $tier2.Count) { throw "tier-2 index $idx out of range" }
            $out.AddRange([byte[]]$tier2[$idx])
            $j += 2
            continue
        }

        if ($b -eq $TIER3_ESC) {
            if (($j + 2) -ge $bodyEnd) { throw "truncated tier-3 token" }
            $idx = [int]$enc[$j + 1] -bor ([int]$enc[$j + 2] -shl 8)
            if ($idx -ge $tier3.Count) { throw "tier-3 index $idx out of range" }
            $out.AddRange([byte[]]$tier3[$idx])
            $j += 3
            continue
        }

        if ($b -eq $LITERAL_ESC) {
            if (($j + 1) -ge $bodyEnd) { throw "truncated literal escape" }
            $out.Add($enc[$j + 1])
            $j += 2
            continue
        }

        $out.Add($b)
        $j++
    }

    # Unary comma: without it an empty result would reach the caller as
    # $null and a one-byte result as a scalar.
    return ,$out.ToArray()
}

# ----------------------------------------------------------------------
# Per-file run
# ----------------------------------------------------------------------

# Runs the V3 encoder on one file in two modes, verifies each result by
# decoding it and comparing SHA-256 with the input, keeps the smaller
# verified result, writes byproducts under $outputsDir and returns a
# hashtable of measurements for the report.
function Run-V3-OnFile([string]$path) {
    $name = Split-Path -Leaf $path
    $rawBytes = [System.IO.File]::ReadAllBytes($path)
    $rawSha = Get-Sha256Hex $rawBytes

    $gzipRaw = Gzip-Bytes $rawBytes
    $brRaw = Try-Brotli-Bytes $rawBytes   # $null when Brotli is unavailable

    # Two adaptive modes:
    #   Mode A: dict-only (tier1+tier2 only, no tier3)
    #   Mode B: full hybrid (all three tiers)
    $phraseLens = @(3,4,5,6,8,10,12,16,24,32,48,64,96,128)
    $modeA = Encode-V3 -inputBytes $rawBytes -phraseLengths $phraseLens -maxTier1 16 -maxTier2 512 -maxTier3 0
    $modeB = Encode-V3 -inputBytes $rawBytes -phraseLengths $phraseLens -maxTier1 16 -maxTier2 512 -maxTier3 256

    # Validate both by round trip. A decode error counts as a failed mode.
    $okA = $false; $shaA = ''; $decA = $null
    try {
        $decA = Decode-V3 $modeA.encodedBytes
        $shaA = Get-Sha256Hex $decA
        $okA = ($shaA -eq $rawSha)
    }
    catch { $okA = $false }

    $okB = $false; $shaB = ''; $decB = $null
    try {
        $decB = Decode-V3 $modeB.encodedBytes
        $shaB = Get-Sha256Hex $decB
        $okB = ($shaB -eq $rawSha)
    }
    catch { $okB = $false }

    # Pick the smaller passing mode; ties go to dict-only. When neither
    # passes, report the hybrid result tagged 'none-pass'.
    $pickA = @{ tag = 'dict-only';    data = $modeA; bytes = $modeA.encodedBytes; sha = $shaA; decoded = $decA }
    $pickB = @{ tag = 'hybrid-3tier'; data = $modeB; bytes = $modeB.encodedBytes; sha = $shaB; decoded = $decB }
    if ($okA -and $okB) {
        if ($modeA.encodedBytes.Length -le $modeB.encodedBytes.Length) { $best = $pickA } else { $best = $pickB }
    }
    elseif ($okA) { $best = $pickA }
    elseif ($okB) { $best = $pickB }
    else {
        $best = $pickB
        $best.tag = 'none-pass'
    }

    # gzip(OCC V3)
    $gzipOcc = Gzip-Bytes $best.bytes

    # Write byproducts
    $base = Join-Path $outputsDir $name
    [System.IO.File]::WriteAllBytes("$base.gz", $gzipRaw)
    if ($null -ne $brRaw) { [System.IO.File]::WriteAllBytes("$base.br", $brRaw) }
    [System.IO.File]::WriteAllBytes("$base.occ3", $best.bytes)
    [System.IO.File]::WriteAllBytes("$base.occ3.gz", $gzipOcc)
    # Reuse the bytes decoded during validation; skipped when decoding failed.
    if ($null -ne $best.decoded) {
        [System.IO.File]::WriteAllBytes("$base.occ3.reconstructed", $best.decoded)
    }

    $passOk = ($best.sha -eq $rawSha)
    $rawLen = $rawBytes.Length
    $occLen = $best.bytes.Length
    $redPct = if ($rawLen -gt 0) { [math]::Round((1.0 - ($occLen / [double]$rawLen)) * 100.0, 2) } else { 0 }

    return @{
        input_file             = $name
        raw_bytes              = $rawLen
        gzip_raw_bytes         = $gzipRaw.Length
        brotli_raw_bytes       = if ($null -ne $brRaw) { $brRaw.Length } else { $null }
        occ_v3_bytes           = $occLen
        occ_v3_mode            = $best.tag
        gzip_occ_v3_bytes      = $gzipOcc.Length
        occ_v3_reduction_pct   = $redPct
        occ_v3_tier1_count     = $best.data.tier1Count
        occ_v3_tier2_count     = $best.data.tier2Count
        occ_v3_tier3_count     = $best.data.tier3Count
        occ_v3_entry_count     = $best.data.entryCount
        reconstruction_status  = if ($passOk) { 'PASS' } else { 'FAIL' }
        raw_sha256             = $rawSha
        reconstructed_sha256   = $best.sha
        mode_a_bytes_dict_only = $modeA.encodedBytes.Length
        mode_a_pass            = $okA
        mode_b_bytes_hybrid_3t = $modeB.encodedBytes.Length
        mode_b_pass            = $okB
    }
}
# ----------------------------------------------------------------------
# Cross-reference V1 / V2 snapshots
# ----------------------------------------------------------------------

# Loads a JSON snapshot file. Returns the parsed object, or $null when the
# file is missing or cannot be read/parsed (snapshots are optional).
function Load-Snapshot([string]$path) {
    # -LiteralPath: the path must not be interpreted as a wildcard pattern.
    if (-not (Test-Path -LiteralPath $path)) { return $null }
    try {
        return Get-Content -Raw -LiteralPath $path | ConvertFrom-Json
    }
    catch {
        return $null
    }
}

# ----------------------------------------------------------------------
# Main: file compression test
# ----------------------------------------------------------------------
Write-Output "OneCharacterCode V3 benchmark -- starting"
Write-Output "Working dir: $scriptDir"
Write-Output ""

# @() keeps $inputs an array even when the directory holds a single file.
$inputs = @(Get-ChildItem -LiteralPath $inputsDir -File | Sort-Object Name)
if ($inputs.Count -eq 0) { throw "No inputs found in $inputsDir" }

$v1snap = Load-Snapshot (Join-Path $scriptDir 'benchmark-results-v1.json')
$v2snap = Load-Snapshot (Join-Path $scriptDir 'benchmark-results-v2.json')

$results = New-Object 'System.Collections.Generic.List[object]'
foreach ($f in $inputs) {
    Write-Output ("Encoding {0} ..." -f $f.Name)
    $r = Run-V3-OnFile $f.FullName

    # Cross-link V1 + V2 numbers for the same input file, when snapshots exist.
    $v1Bytes = $null
    $v2Bytes = $null
    if ($null -ne $v1snap) {
        foreach ($x in $v1snap.results) {
            if ($x.input_file -eq $f.Name) {
                if ($null -ne $x.occ_symbolic_bytes) { $v1Bytes = $x.occ_symbolic_bytes }
                elseif ($null -ne $x.occ_bytes) { $v1Bytes = $x.occ_bytes }
            }
        }
    }
    if ($null -ne $v2snap) {
        foreach ($x in $v2snap.results) {
            if ($x.input_file -eq $f.Name) { $v2Bytes = $x.occ_v2_bytes }
        }
    }
    $r['occ_v1_bytes'] = $v1Bytes
    $r['occ_v2_bytes'] = $v2Bytes

    if ($null -ne $v1Bytes -and $v1Bytes -gt 0) {
        $r['v3_vs_v1_reduction_pct'] = [math]::Round((1.0 - ($r.occ_v3_bytes / [double]$v1Bytes)) * 100.0, 2)
    }
    else {
        $r['v3_vs_v1_reduction_pct'] = $null
    }
    if ($null -ne $v2Bytes -and $v2Bytes -gt 0) {
        $r['v3_vs_v2_reduction_pct'] = [math]::Round((1.0 - ($r.occ_v3_bytes / [double]$v2Bytes)) * 100.0, 2)
    }
    else {
        $r['v3_vs_v2_reduction_pct'] = $null
    }

    $results.Add([pscustomobject]$r)
    Write-Output (" raw={0} bytes gzip={1} v3={2} reduction={3}% mode={4} recon={5}" -f $r.raw_bytes, $r.gzip_raw_bytes, $r.occ_v3_bytes, $r.occ_v3_reduction_pct, $r.occ_v3_mode, $r.reconstruction_status)
}

# ----------------------------------------------------------------------
# System-level bandwidth simulation
# ----------------------------------------------------------------------
Write-Output ""
Write-Output "System-level bandwidth simulation"
Write-Output "---------------------------------"
Write-Output "(NOT file compression. Simulates session-level cloud-full-download"
Write-Output " vs. local-install-once-plus-small-update-packet sync model.)"
Write-Output ""

# Sizes come from the FileInfo objects already in hand; re-resolving each
# path with Get-Item would treat '[' and ']' in a file name as wildcards.
$initialBytes = [int64]0
foreach ($f in $inputs) { $initialBytes += $f.Length }
# int64 so that large input sets cannot overflow the cast.
$updateBytes = [int64][math]::Ceiling($initialBytes * 0.05)   # simulated 5% delta per session

$sessionCounts = @(1, 5, 10, 30, 100)
$bwResults = New-Object 'System.Collections.Generic.List[object]'
foreach ($N in $sessionCounts) {
    # Cloud model: every session downloads the full package.
    $cloud = [int64]$N * [int64]$initialBytes
    # Local model: one full install, then one update packet per later session.
    if ($N -ge 1) {
        $local = [int64]$initialBytes + ([int64]([Math]::Max(0, $N - 1)) * [int64]$updateBytes)
    }
    else {
        $local = 0
    }
    $saved = $cloud - $local
    $pct = if ($cloud -gt 0) { [math]::Round(($saved / [double]$cloud) * 100.0, 2) } else { 0 }
    $bwResults.Add([pscustomobject]@{
        sessions                         = $N
        initial_package_bytes            = $initialBytes
        simulated_update_bytes           = $updateBytes
        cloud_full_download_bytes        = $cloud
        local_install_plus_updates_bytes = $local
        bytes_saved                      = $saved
        savings_pct                      = $pct
    })
    Write-Output (" sessions={0,3} cloud={1,12} bytes local={2,12} bytes saved={3,12} bytes ({4}%)" -f $N, $cloud, $local, $saved, $pct)
}

# ----------------------------------------------------------------------
# Write outputs
# ----------------------------------------------------------------------
$now = (Get-Date).ToString('o')
$machine = $env:COMPUTERNAME
$psv = $PSVersionTable.PSVersion.ToString()

$packet = [pscustomobject]@{
    schema_version = 'occ-bench-v3'
    generated_at   = $now
    machine_name   = $machine
    ps_version     = $psv
    encoder_notes  = @(
        'Three-tier symbolic tokens (1-byte / 2-byte / 3-byte)',
        'Tier 1: 16 single-byte tokens (avoiding TAB/LF/CR/NUL)',
        'Tier 2: 512 two-byte tokens (ESC 0x14 + idx, ESC 0x15 + idx)',
        'Tier 3: up to 256 three-byte tokens (ESC 0x16 + 2-byte index)',
        'Iterative refinement: rescan working text after each accept',
        'Per-file adaptive: dict-only vs hybrid-3tier, pick smaller with PASS',
        'Compact header: 1-byte length prefixes per entry',
        'Phrase length pool: 3,4,5,6,8,10,12,16,24,32,48,64,96,128',
        'Reserved-byte escape 0x17 for literal pass-through',
        'SHA-256 round-trip required for any number to count'
    )
    results        = $results.ToArray()
}
$packetJson = $packet | ConvertTo-Json -Depth 10
[System.IO.File]::WriteAllText((Join-Path $scriptDir 'benchmark-results-v3.json'), $packetJson, [System.Text.Encoding]::UTF8)

$bwPacket = [pscustomobject]@{
    schema_version = 'occ-system-bandwidth-v3'
    generated_at   = $now
    machine_name   = $machine
    ps_version     = $psv
    disclosure     = @(
        'This is NOT file compression. This is a session-level bandwidth model.',
        'It compares (a) every session pulling the full package from the cloud',
        'against (b) installing the package once locally and pulling only small',
        'update packets after that.',
        'The 5% update-packet figure is a SIMULATED placeholder, not measured.',
        'The savings percentage is therefore a model prediction, not an',
        'empirical compression result.'
    )
    inputs         = @(foreach ($f in $inputs) { [pscustomobject]@{ file = $f.Name; bytes = $f.Length } })
    initial_package_bytes = $initialBytes
    simulated_update_bytes = $updateBytes
    simulated_update_ratio_of_initial_pct = 5.0
    rows           = $bwResults.ToArray()
}
[System.IO.File]::WriteAllText((Join-Path $scriptDir 'system-level-bandwidth-results-v3.json'), ($bwPacket | ConvertTo-Json -Depth 10), [System.Text.Encoding]::UTF8)

# Human-readable v3 run report
$txt = New-Object System.Text.StringBuilder
[void]$txt.AppendLine("OneCharacterCode V3 prototype - run report")
[void]$txt.AppendLine("Generated: $now")
[void]$txt.AppendLine("Machine: $machine PowerShell: $psv")
[void]$txt.AppendLine("")
[void]$txt.AppendLine("PART 1 - FILE COMPRESSION (V3 prototype encoder)")
[void]$txt.AppendLine("-------------------------------------------------")
[void]$txt.AppendLine("")
[void]$txt.AppendLine(("{0,-32} {1,10} {2,10} {3,10} {4,10} {5,10} {6,10} {7,10} {8,7} {9,11}" -f 'File','Raw','Gzip','OCC V1','OCC V2','OCC V3','GzOCCV3','V3red%','Recon','Mode'))
foreach ($r in $results) {
    # Missing V1/V2 snapshot numbers are printed as 0 in the text report.
    $v1 = if ($null -ne $r.occ_v1_bytes) { $r.occ_v1_bytes } else { 0 }
    $v2 = if ($null -ne $r.occ_v2_bytes) { $r.occ_v2_bytes } else { 0 }
    [void]$txt.AppendLine(("{0,-32} {1,10} {2,10} {3,10} {4,10} {5,10} {6,10} {7,10}% {8,7} {9,11}" -f $r.input_file, $r.raw_bytes, $r.gzip_raw_bytes, $v1, $v2, $r.occ_v3_bytes, $r.gzip_occ_v3_bytes, $r.occ_v3_reduction_pct, $r.reconstruction_status, $r.occ_v3_mode))
}
[void]$txt.AppendLine("")
[void]$txt.AppendLine("PART 2 - SYSTEM-LEVEL BANDWIDTH SIMULATION")
[void]$txt.AppendLine("-------------------------------------------")
[void]$txt.AppendLine("Initial package = sum of inputs = $initialBytes bytes")
[void]$txt.AppendLine("Simulated update packet (5% of initial) = $updateBytes bytes")
[void]$txt.AppendLine("This is NOT a compression measurement. It models a sync strategy.")
[void]$txt.AppendLine("")
[void]$txt.AppendLine(("{0,8} {1,15} {2,15} {3,15} {4,10}" -f 'Sessions','CloudFull','LocalSync','Saved','Saved%'))
foreach ($b in $bwResults) {
    [void]$txt.AppendLine(("{0,8} {1,15} {2,15} {3,15} {4,9}%" -f $b.sessions, $b.cloud_full_download_bytes, $b.local_install_plus_updates_bytes, $b.bytes_saved, $b.savings_pct))
}
[void]$txt.AppendLine("")
[void]$txt.AppendLine("End of report.")
[System.IO.File]::WriteAllText((Join-Path $scriptDir 'benchmark-test-run-v3.txt'), $txt.ToString(), [System.Text.Encoding]::UTF8)

Write-Output ""
Write-Output "Wrote: benchmark-results-v3.json"
Write-Output "Wrote: system-level-bandwidth-results-v3.json"
Write-Output "Wrote: benchmark-test-run-v3.txt"
Write-Output "Per-file byproducts in: $outputsDir"
Write-Output ""
Write-Output "Done."