# run_stacked_compression_test.ps1
# OneCharacterCode V3 STACKED COMPRESSION TEST - PowerShell 5.1 compatible.
#
# Correction from Bret Fencl:
#   If OCC data is transmitted, it can still be compressed again with gzip.
#   The fair transport comparison is gzip(raw) vs gzip(OCC carrier).
#
# Pipeline tested per input file:
#   raw bytes
#     -> gzip(raw)                [baseline transport size]
#   raw bytes
#     -> OCC V3 encoder
#     -> gzip(OCC V3 carrier)     [stacked transport size]
#     -> gunzip                   [recover OCC V3 carrier]
#     -> OCC V3 decoder           [recover raw]
#     -> SHA-256 match check      [must equal SHA-256 of original]
#
# Winner per file:
#   gzip(raw) bytes vs gzip(OCC V3) bytes -- smaller wins.
#   If gzip(raw) is smaller or equal, this script reports that gzip(raw) wins.
#   If the stacked round-trip does not reproduce the original bytes, gzip(raw)
#   wins regardless of size.
#
# No fake wins. The V3 encoder/decoder is re-defined inline below (it is NOT
# dot-sourced), so this file is self-contained; the reserved-byte table must
# be kept in sync with run_benchmark_v3.ps1.
#
# Inputs : every file in .\inputs (next to this script)
# Outputs: .\outputs\<name>.gz, .occ3, .occ3.gz, .reconstructed_from_gzip_occ3
#          plus stacked-compression-results-v3.json and
#          stacked-compression-test-run-v3.txt next to this script.

$ErrorActionPreference = 'Stop'

$scriptDir  = Split-Path -Parent $MyInvocation.MyCommand.Path
$inputsDir  = Join-Path $scriptDir 'inputs'
$outputsDir = Join-Path $scriptDir 'outputs'
if (-not (Test-Path $outputsDir)) {
    New-Item -ItemType Directory -Path $outputsDir | Out-Null
}

# ----------------------------------------------------------------------
# Helpers (kept local so this script is self-contained)
# ----------------------------------------------------------------------

# Returns the lowercase hex SHA-256 digest of $bytes.
function Get-Sha256Hex([byte[]]$bytes) {
    $sha = [System.Security.Cryptography.SHA256]::Create()
    try {
        ($sha.ComputeHash($bytes) | ForEach-Object { $_.ToString('x2') }) -join ''
    }
    finally {
        $sha.Dispose()
    }
}

# Returns gzip($bytes) as a byte[].
function Gzip-Bytes([byte[]]$bytes) {
    $ms = New-Object System.IO.MemoryStream
    try {
        $gz = New-Object System.IO.Compression.GZipStream($ms, [System.IO.Compression.CompressionMode]::Compress)
        try {
            $gz.Write($bytes, 0, $bytes.Length)
        }
        finally {
            # Closing the GZipStream flushes the remaining compressed data into $ms.
            $gz.Dispose()
        }
        # ToArray() is still valid after the MemoryStream has been closed.
        $out = $ms.ToArray()
    }
    finally {
        $ms.Dispose()
    }
    # Leading comma: return the byte[] as one object instead of letting the
    # pipeline unroll it (an unrolled empty array would become $null).
    return ,$out
}

# Returns the gunzipped content of $bytes as a byte[].
function Gunzip-Bytes([byte[]]$bytes) {
    $msIn = New-Object System.IO.MemoryStream(,$bytes)
    try {
        $gz = New-Object System.IO.Compression.GZipStream($msIn, [System.IO.Compression.CompressionMode]::Decompress)
        $msOut = New-Object System.IO.MemoryStream
        try {
            $buf = New-Object byte[] 8192
            while ($true) {
                $n = $gz.Read($buf, 0, $buf.Length)
                if ($n -le 0) { break }
                $msOut.Write($buf, 0, $n)
            }
            $out = $msOut.ToArray()
        }
        finally {
            $gz.Dispose()
            $msOut.Dispose()
        }
    }
    finally {
        $msIn.Dispose()
    }
    # Leading comma keeps an empty result as byte[0] rather than $null.
    return ,$out
}

# ----------------------------------------------------------------------
# V3 encoder/decoder.
#
# Dot-sourcing run_benchmark_v3.ps1 would execute that whole script,
# including its main loop and the outputs it writes. To avoid those
# side-effects the encoder/decoder is re-defined inline here, so this
# file is fully self-contained and doesn't depend on the other script.
# ----------------------------------------------------------------------

# Reserved-byte table (must match V3 encoder exactly)
$TIER1_BYTES = @(0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,
                 0x0B,0x0C,
                 0x0E,0x0F,0x10,0x11,0x12,0x13)
$TIER2_ESC_A = 0x14
$TIER2_ESC_B = 0x15
$TIER3_ESC   = 0x16
$LITERAL_ESC = 0x17

# Every byte value that has a special meaning in the encoded body; literal
# occurrences of these are emitted as LITERAL_ESC + byte.
$RESERVED = New-Object 'System.Collections.Generic.HashSet[byte]'
foreach ($b in $TIER1_BYTES) { [void]$RESERVED.Add([byte]$b) }
[void]$RESERVED.Add([byte]$TIER2_ESC_A)
[void]$RESERVED.Add([byte]$TIER2_ESC_B)
[void]$RESERVED.Add([byte]$TIER3_ESC)
[void]$RESERVED.Add([byte]$LITERAL_ESC)

# First char code used as an in-memory placeholder for an accepted phrase.
# Latin-1 decoded input only contains chars 0x00..0xFF, so these never collide.
$PLACEHOLDER_FIRST = 0xE000

# Maps each byte to the char with the same code point (code page 28591).
function Bytes-To-Latin1String([byte[]]$bytes) {
    $enc = [System.Text.Encoding]::GetEncoding(28591)
    return $enc.GetString($bytes)
}

# Inverse of Bytes-To-Latin1String; returns a byte[].
function Latin1String-To-Bytes([string]$s) {
    $enc = [System.Text.Encoding]::GetEncoding(28591)
    $bytes = $enc.GetBytes($s)
    return ,$bytes
}

# Counts non-overlapping ordinal occurrences of $needle in $haystack.
function Count-Occurrences([string]$haystack, [string]$needle) {
    if ([string]::IsNullOrEmpty($needle)) { return 0 }
    $count = 0; $idx = 0; $nlen = $needle.Length
    while ($true) {
        $found = $haystack.IndexOf($needle, $idx, [System.StringComparison]::Ordinal)
        if ($found -lt 0) { break }
        $count++; $idx = $found + $nlen
    }
    return $count
}

# Replaces every non-overlapping ordinal occurrence of $needle (scanning left
# to right) with $replacement and returns the new string.
function Replace-All-NonOverlap([string]$haystack, [string]$needle, [string]$replacement) {
    if ([string]::IsNullOrEmpty($needle)) { return $haystack }
    $sb = New-Object System.Text.StringBuilder
    $idx = 0; $nlen = $needle.Length
    while ($true) {
        $found = $haystack.IndexOf($needle, $idx, [System.StringComparison]::Ordinal)
        if ($found -lt 0) { [void]$sb.Append($haystack.Substring($idx)); break }
        if ($found -gt $idx) { [void]$sb.Append($haystack.Substring($idx, $found - $idx)) }
        [void]$sb.Append($replacement)
        $idx = $found + $nlen
    }
    return $sb.ToString()
}

# Encodes $inputBytes into an OCC3 carrier.
#
# Parameters:
#   inputBytes    - raw bytes to encode
#   phraseLengths - candidate phrase lengths to search (lengths < 3 are skipped)
#   maxTier1      - number of 1-byte token slots (indexes $TIER1_BYTES, which has 16 entries)
#   maxTier2      - number of 2-byte token slots (ESC_A covers 0..255, ESC_B covers 256..511)
#   maxTier3      - number of 3-byte token slots (ESC + 16-bit little-endian index)
#
# Returns: hashtable with key 'encodedBytes' (byte[]) laid out as
#   'OCC3' | t1 count (1 byte) | t1 entries | t2 count (2 bytes LE) | t2 entries |
#   t3 count (2 bytes LE) | t3 entries | body length (4 bytes LE) | body
# where each dictionary entry is a 1-byte length followed by the phrase bytes.
function Encode-V3 {
    param([byte[]]$inputBytes, [int[]]$phraseLengths, [int]$maxTier1, [int]$maxTier2, [int]$maxTier3)

    $totalSlots = $maxTier1 + $maxTier2 + $maxTier3

    # --- Pass 1: greedy phrase selection on a working copy ----------------
    # Each accepted phrase is replaced by a placeholder char so later
    # candidates cannot span text that is already claimed.
    $working = Bytes-To-Latin1String $inputBytes
    $accepted = New-Object 'System.Collections.Generic.List[string]'
    $placeholderIdx = 0
    $maxIterations = $totalSlots + 4
    for ($iter = 0; $iter -lt $maxIterations; $iter++) {
        if ($accepted.Count -ge $totalSlots) { break }
        $tentativeTokenCost = 2
        $bestText = $null; $bestNet = 0
        foreach ($L in $phraseLengths) {
            if ($L -lt 3) { continue }
            if ($working.Length -lt ($L * 2)) { continue }
            $counts = New-Object 'System.Collections.Generic.Dictionary[string,int]'
            $upper = $working.Length - $L
            for ($i = 0; $i -le $upper; $i++) {
                $sub = $working.Substring($i, $L)
                $hasPlaceholder = $false
                for ($k = 0; $k -lt $L; $k++) {
                    if ([int]$sub[$k] -ge 0xE000) { $hasPlaceholder = $true; break }
                }
                if ($hasPlaceholder) { continue }
                if ($counts.ContainsKey($sub)) { $counts[$sub] = $counts[$sub] + 1 } else { $counts[$sub] = 1 }
            }
            foreach ($kv in $counts.GetEnumerator()) {
                if ($kv.Value -lt 2) { continue }
                # Estimated saving: per-occurrence gain minus the dictionary
                # entry cost (1 length byte + phrase bytes).
                $net = (($L - $tentativeTokenCost) * $kv.Value) - (1 + $L)
                if ($net -gt $bestNet) { $bestNet = $net; $bestText = $kv.Key }
            }
        }
        if ($null -eq $bestText -or $bestNet -le 0) { break }
        # Placeholders occupy 0xE000..0xFFFF; stop before running past [char] range.
        if ($placeholderIdx -gt 0x1FFF) { break }
        $phChar = [char]($PLACEHOLDER_FIRST + $placeholderIdx); $placeholderIdx++
        $accepted.Add($bestText)
        $working = Replace-All-NonOverlap $working $bestText ([string]$phChar)
    }

    # --- Pass 2: replay in acceptance order to get real occurrence counts --
    $replay = Bytes-To-Latin1String $inputBytes
    $entryInfo = New-Object 'System.Collections.Generic.List[object]'
    $placeholderIdx2 = 0
    foreach ($text in $accepted) {
        $count = Count-Occurrences $replay $text
        $phChar = [char]($PLACEHOLDER_FIRST + $placeholderIdx2); $placeholderIdx2++
        $replay = Replace-All-NonOverlap $replay $text ([string]$phChar)
        $entryInfo.Add(@{ Text = $text; Length = $text.Length; Count = $count; Placeholder = $phChar })
    }

    # --- Rank by covered bytes and assign token tiers ----------------------
    # Hashtable keys shadow the Hashtable's own properties under dot access,
    # so $_.Length / $_.Count read the 'Length' / 'Count' entries.
    $ranked = $entryInfo | Sort-Object -Descending { $_.Length * $_.Count }
    $rankedArr = @($ranked)
    for ($r = 0; $r -lt $rankedArr.Count; $r++) {
        $tier = 0
        if ($r -lt $maxTier1) { $tier = 1 }
        elseif ($r -lt ($maxTier1 + $maxTier2)) { $tier = 2 }
        elseif ($r -lt ($maxTier1 + $maxTier2 + $maxTier3)) { $tier = 3 }
        $rankedArr[$r].Tier = $tier
        if ($tier -eq 1) {
            $rankedArr[$r].TokenBytes = @([byte]$TIER1_BYTES[$r])
        }
        elseif ($tier -eq 2) {
            $t2idx = $r - $maxTier1
            if ($t2idx -lt 256) { $rankedArr[$r].TokenBytes = @([byte]$TIER2_ESC_A, [byte]$t2idx) }
            else { $rankedArr[$r].TokenBytes = @([byte]$TIER2_ESC_B, [byte]($t2idx - 256)) }
        }
        elseif ($tier -eq 3) {
            $t3idx = $r - $maxTier1 - $maxTier2
            $lo = [byte]($t3idx -band 0xFF); $hi = [byte](($t3idx -shr 8) -band 0xFF)
            $rankedArr[$r].TokenBytes = @([byte]$TIER3_ESC, $lo, $hi)
        }
        else {
            $rankedArr[$r].TokenBytes = $null
        }
    }

    # --- Pass 3: build the body in ranked order ----------------------------
    $bodyStr = Bytes-To-Latin1String $inputBytes
    $tokenLookup = @{}
    foreach ($e in $rankedArr) {
        if ($null -eq $e.TokenBytes) { continue }
        $bodyStr = Replace-All-NonOverlap $bodyStr $e.Text ([string]$e.Placeholder)
        $tokenLookup[$e.Placeholder] = $e.TokenBytes
    }
    $bodyOut = New-Object System.Collections.Generic.List[byte]
    for ($i = 0; $i -lt $bodyStr.Length; $i++) {
        $ch = $bodyStr[$i]; $code = [int]$ch
        if ($code -ge $PLACEHOLDER_FIRST) {
            $tb = $tokenLookup[$ch]
            foreach ($b in $tb) { [void]$bodyOut.Add([byte]$b) }
        }
        else {
            $b = [byte]$code
            if ($RESERVED.Contains($b)) {
                # Literal byte that collides with a token/escape value.
                [void]$bodyOut.Add([byte]$LITERAL_ESC); [void]$bodyOut.Add($b)
            }
            else {
                [void]$bodyOut.Add($b)
            }
        }
    }

    # --- Header: magic, three dictionaries, body length, body --------------
    $t1 = @($rankedArr | Where-Object { $_.Tier -eq 1 })
    $t2 = @($rankedArr | Where-Object { $_.Tier -eq 2 })
    $t3 = @($rankedArr | Where-Object { $_.Tier -eq 3 })
    $hdr = New-Object System.Collections.Generic.List[byte]
    foreach ($c in [byte[]]([System.Text.Encoding]::ASCII.GetBytes('OCC3'))) { [void]$hdr.Add($c) }

    [void]$hdr.Add([byte]$t1.Count)
    foreach ($e in $t1) {
        $bb = Latin1String-To-Bytes $e.Text
        [void]$hdr.Add([byte]$bb.Length)
        foreach ($x in $bb) { [void]$hdr.Add($x) }
    }

    [void]$hdr.Add([byte]($t2.Count -band 0xFF)); [void]$hdr.Add([byte](($t2.Count -shr 8) -band 0xFF))
    foreach ($e in $t2) {
        $bb = Latin1String-To-Bytes $e.Text
        [void]$hdr.Add([byte]$bb.Length)
        foreach ($x in $bb) { [void]$hdr.Add($x) }
    }

    [void]$hdr.Add([byte]($t3.Count -band 0xFF)); [void]$hdr.Add([byte](($t3.Count -shr 8) -band 0xFF))
    foreach ($e in $t3) {
        $bb = Latin1String-To-Bytes $e.Text
        [void]$hdr.Add([byte]$bb.Length)
        foreach ($x in $bb) { [void]$hdr.Add($x) }
    }

    $bl = $bodyOut.Count
    [void]$hdr.Add([byte]($bl -band 0xFF))
    [void]$hdr.Add([byte](($bl -shr 8) -band 0xFF))
    [void]$hdr.Add([byte](($bl -shr 16) -band 0xFF))
    [void]$hdr.Add([byte](($bl -shr 24) -band 0xFF))
    foreach ($x in $bodyOut) { [void]$hdr.Add($x) }

    return @{ encodedBytes = $hdr.ToArray() }
}

# Decodes an OCC3 carrier produced by Encode-V3 and returns the raw byte[].
# Throws on a too-short buffer, a bad magic, or a body length that runs past
# the end of the buffer.
function Decode-V3([byte[]]$enc) {
    if ($enc.Length -lt 4) { throw "encoded too short" }
    if ([System.Text.Encoding]::ASCII.GetString($enc, 0, 4) -ne 'OCC3') { throw "bad magic" }
    $i = 4

    $tier1 = New-Object System.Collections.Generic.List[byte[]]
    $tier2 = New-Object System.Collections.Generic.List[byte[]]
    $tier3 = New-Object System.Collections.Generic.List[byte[]]

    # Tier-1 dictionary: 1-byte count, then (1-byte length + bytes) per entry.
    $t1c = $enc[$i]; $i++
    for ($k = 0; $k -lt $t1c; $k++) {
        $len = $enc[$i]; $i++
        $b = New-Object byte[] $len
        [Array]::Copy($enc, $i, $b, 0, $len)
        $tier1.Add($b); $i += $len
    }

    # Tier-2 dictionary: 2-byte little-endian count.
    $t2c = [int]$enc[$i] -bor ([int]$enc[$i+1] -shl 8); $i += 2
    for ($k = 0; $k -lt $t2c; $k++) {
        $len = $enc[$i]; $i++
        $b = New-Object byte[] $len
        [Array]::Copy($enc, $i, $b, 0, $len)
        $tier2.Add($b); $i += $len
    }

    # Tier-3 dictionary: 2-byte little-endian count.
    $t3c = [int]$enc[$i] -bor ([int]$enc[$i+1] -shl 8); $i += 2
    for ($k = 0; $k -lt $t3c; $k++) {
        $len = $enc[$i]; $i++
        $b = New-Object byte[] $len
        [Array]::Copy($enc, $i, $b, 0, $len)
        $tier3.Add($b); $i += $len
    }

    # Body length: 4 bytes little-endian.
    $bl = [int]$enc[$i] -bor ([int]$enc[$i+1] -shl 8) -bor ([int]$enc[$i+2] -shl 16) -bor ([int]$enc[$i+3] -shl 24); $i += 4
    $bodyEnd = $i + $bl
    # Out-of-range array reads return $null instead of throwing, so a truncated
    # carrier must be rejected explicitly rather than decoded into garbage.
    if ($bl -lt 0 -or $bodyEnd -gt $enc.Length) { throw "encoded body truncated" }

    $out = New-Object System.Collections.Generic.List[byte]
    $j = $i
    while ($j -lt $bodyEnd) {
        $b = $enc[$j]

        # 1-byte token?
        $isT1 = $false
        for ($t = 0; $t -lt $TIER1_BYTES.Count; $t++) {
            if ($b -eq [byte]$TIER1_BYTES[$t]) {
                if ($t -lt $tier1.Count) {
                    foreach ($x in $tier1[$t]) { [void]$out.Add($x) }
                    $isT1 = $true
                }
                break
            }
        }
        if ($isT1) { $j++; continue }

        if ($b -eq $TIER2_ESC_A) {
            $idx = [int]$enc[$j+1]
            if ($idx -lt $tier2.Count) { foreach ($x in $tier2[$idx]) { [void]$out.Add($x) } }
            $j += 2; continue
        }
        if ($b -eq $TIER2_ESC_B) {
            $idx = [int]$enc[$j+1] + 256
            if ($idx -lt $tier2.Count) { foreach ($x in $tier2[$idx]) { [void]$out.Add($x) } }
            $j += 2; continue
        }
        if ($b -eq $TIER3_ESC) {
            $idx = [int]$enc[$j+1] -bor ([int]$enc[$j+2] -shl 8)
            if ($idx -lt $tier3.Count) { foreach ($x in $tier3[$idx]) { [void]$out.Add($x) } }
            $j += 3; continue
        }
        if ($b -eq $LITERAL_ESC) {
            [void]$out.Add($enc[$j+1]); $j += 2; continue
        }

        [void]$out.Add($b); $j++
    }

    # Leading comma: an empty decode result must stay byte[0], not become $null.
    $decoded = $out.ToArray()
    return ,$decoded
}

# ----------------------------------------------------------------------
# Main: per-file stacked pipeline
# ----------------------------------------------------------------------
Write-Output "OneCharacterCode V3 STACKED compression test"
Write-Output "Pipeline per file: raw -> gzip(raw) AND raw -> OCC V3 -> gzip(OCC V3) -> gunzip -> decode -> SHA match"
Write-Output ""

$phraseLens = @(3,4,5,6,8,10,12,16,24,32,48,64,96,128)

# @() guarantees an array so .Count is reliable for zero or one input file.
$inputs = @(Get-ChildItem -Path $inputsDir -File | Sort-Object Name)
if ($inputs.Count -eq 0) { throw "No inputs in $inputsDir" }

$results = @()
foreach ($f in $inputs) {
    Write-Output ("--- {0} ---" -f $f.Name)
    $raw = [System.IO.File]::ReadAllBytes($f.FullName)
    $rawSha = Get-Sha256Hex $raw

    # Baseline transport: gzip(raw)
    $gzipRaw = Gzip-Bytes $raw

    # Stacked transport: gzip(OCC V3)
    # Run encoder in the SAME adaptive way as the public V3 page:
    $modeA = Encode-V3 -inputBytes $raw -phraseLengths $phraseLens -maxTier1 16 -maxTier2 512 -maxTier3 0
    $modeB = Encode-V3 -inputBytes $raw -phraseLengths $phraseLens -maxTier1 16 -maxTier2 512 -maxTier3 256

    # Validate both, pick the smaller that passes
    $okA = $false; $okB = $false
    try { $okA = ((Get-Sha256Hex (Decode-V3 $modeA.encodedBytes)) -eq $rawSha) } catch { $okA = $false }
    try { $okB = ((Get-Sha256Hex (Decode-V3 $modeB.encodedBytes)) -eq $rawSha) } catch { $okB = $false }

    $occBytes = $null; $mode = 'none-pass'
    if ($okA -and $okB) {
        if ($modeA.encodedBytes.Length -le $modeB.encodedBytes.Length) { $occBytes = $modeA.encodedBytes; $mode = 'dict-only' }
        else { $occBytes = $modeB.encodedBytes; $mode = 'hybrid-3tier' }
    }
    elseif ($okA) { $occBytes = $modeA.encodedBytes; $mode = 'dict-only' }
    elseif ($okB) { $occBytes = $modeB.encodedBytes; $mode = 'hybrid-3tier' }
    else { $occBytes = $modeB.encodedBytes; $mode = 'none-pass' }

    $gzipOcc = Gzip-Bytes $occBytes

    # FULL ROUND-TRIP via the stacked transport:
    #   bytes you would actually transmit = gzipOcc
    #   receiver does: gunzip -> decode V3 -> recover raw
    $roundTripOk = $false
    $reconSha = $null
    $rec = $null
    try {
        $ungz = Gunzip-Bytes $gzipOcc
        $rec = Decode-V3 $ungz
        $reconSha = Get-Sha256Hex $rec
        $roundTripOk = ($reconSha -eq $rawSha)
    }
    catch {
        Write-Warning ("stacked round-trip failed for {0}: {1}" -f $f.Name, $_.Exception.Message)
        $roundTripOk = $false
        $reconSha = $null
        $rec = $null
    }

    # The round-trip artefacts are written only when gunzip + decode completed.
    # Writing happens outside the try so a disk error surfaces as an error
    # instead of being reported as a codec round-trip failure.
    if ($null -ne $rec) {
        [System.IO.File]::WriteAllBytes((Join-Path $outputsDir ($f.Name + '.occ3.gz')), $gzipOcc)
        [System.IO.File]::WriteAllBytes((Join-Path $outputsDir ($f.Name + '.reconstructed_from_gzip_occ3')), $rec)
    }

    # Write byproducts
    [System.IO.File]::WriteAllBytes((Join-Path $outputsDir ($f.Name + '.gz')), $gzipRaw)
    [System.IO.File]::WriteAllBytes((Join-Path $outputsDir ($f.Name + '.occ3')), $occBytes)

    # Rule: no OCC win unless gzip(OCC V3) < gzip(raw) AND roundtrip PASS.
    $winner = ''
    $message = ''
    if ($roundTripOk -and ($gzipOcc.Length -lt $gzipRaw.Length)) {
        $winner = 'OCC V3 + gzip'
        $message = 'OCC V3 + gzip wins for this file.'
    }
    elseif (-not $roundTripOk) {
        $winner = 'Gzip(raw)'
        $message = 'Gzip(raw) wins for this file (OCC V3 + gzip roundtrip FAILED).'
    }
    else {
        $winner = 'Gzip(raw)'
        $message = 'Gzip(raw) still wins for this file.'
    }

    # Positive = stacked transport is smaller than gzip(raw) by that percentage.
    $stackedVsRawPct = if ($gzipRaw.Length -gt 0) {
        [math]::Round((1.0 - ($gzipOcc.Length / [double]$gzipRaw.Length)) * 100.0, 2)
    } else { 0 }

    $shaMatch = if ($null -ne $reconSha) { ($reconSha -eq $rawSha) } else { $false }
    $rtStatus = if ($roundTripOk) { 'PASS' } else { 'FAIL' }

    $results += [pscustomobject]@{
        input_file               = $f.Name
        raw_bytes                = $raw.Length
        gzip_raw_bytes           = $gzipRaw.Length
        occ_v3_bytes             = $occBytes.Length
        occ_v3_mode              = $mode
        gzip_occ_v3_bytes        = $gzipOcc.Length
        best_transport_winner    = $winner
        winner_message           = $message
        gzip_occ_vs_gzip_raw_pct = $stackedVsRawPct
        roundtrip_status         = $rtStatus
        sha256_match             = $shaMatch
        raw_sha256               = $rawSha
        reconstructed_sha256     = $reconSha
    }

    Write-Output (" raw = {0,8} bytes" -f $raw.Length)
    Write-Output (" gzip(raw) = {0,8} bytes" -f $gzipRaw.Length)
    Write-Output (" OCC V3 = {0,8} bytes ({1})" -f $occBytes.Length, $mode)
    Write-Output (" gzip(OCC V3) = {0,8} bytes ({1}% vs gzip(raw))" -f $gzipOcc.Length, $stackedVsRawPct)
    Write-Output (" winner : {0}" -f $message)
    Write-Output (" roundtrip : {0} sha-match: {1}" -f $rtStatus, $shaMatch)
    Write-Output ""
}

# ----------------------------------------------------------------------
# Write JSON + text reports
# ----------------------------------------------------------------------
$now = (Get-Date).ToString('o')
$machine = $env:COMPUTERNAME
$psv = $PSVersionTable.PSVersion.ToString()

$pkg = [pscustomobject]@{
    schema_version = 'occ-stacked-v3'
    generated_at   = $now
    machine_name   = $machine
    ps_version     = $psv
    description    = 'Stacked transport-size test: gzip(raw) vs gzip(OCC V3 carrier). Roundtrip = gunzip -> decode V3 -> SHA-256 match.'
    disclosure     = "This test answers Bret Fencl's correction: if OneCharacterCode data is transmitted, it can still be compressed again with gzip. The fair transport comparison is gzip(raw) versus gzip(OCC carrier)."
    rule           = 'No OCC win unless gzip(OCC V3) < gzip(raw) AND roundtrip PASS.'
    results        = $results
}
$json = $pkg | ConvertTo-Json -Depth 10
# UTF-8 without BOM.
$enc8 = New-Object System.Text.UTF8Encoding $false
[System.IO.File]::WriteAllText((Join-Path $scriptDir 'stacked-compression-results-v3.json'), $json, $enc8)

# Text report
$txt = New-Object System.Text.StringBuilder
[void]$txt.AppendLine('OneCharacterCode V3 STACKED compression test - run report')
[void]$txt.AppendLine("Generated: $now")
[void]$txt.AppendLine("Machine: $machine PowerShell: $psv")
[void]$txt.AppendLine('')
[void]$txt.AppendLine("Pipeline per file:")
[void]$txt.AppendLine(" baseline transport: raw -> gzip(raw)")
[void]$txt.AppendLine(" stacked transport: raw -> OCC V3 -> gzip(OCC V3)")
[void]$txt.AppendLine(" receiver: gzip(OCC V3) -> gunzip -> decode V3 -> raw")
[void]$txt.AppendLine(" SHA-256 of original vs SHA-256 of reconstructed must match.")
[void]$txt.AppendLine('')
[void]$txt.AppendLine(("{0,-32} {1,10} {2,10} {3,10} {4,10} {5,12} {6,10} {7,7} {8,5}" -f 'File','Raw','GzRaw','OCCV3','GzOCCV3','Winner','Stacked%','Recon','SHA'))
foreach ($r in $results) {
    $winShort = if ($r.best_transport_winner -eq 'Gzip(raw)') { 'gzip(raw)' } else { 'OCC+gzip' }
    $shaShort = if ($r.sha256_match) { 'OK' } else { 'NO' }
    [void]$txt.AppendLine(("{0,-32} {1,10} {2,10} {3,10} {4,10} {5,12} {6,9}% {7,7} {8,5}" -f $r.input_file, $r.raw_bytes, $r.gzip_raw_bytes, $r.occ_v3_bytes, $r.gzip_occ_v3_bytes, $winShort, $r.gzip_occ_vs_gzip_raw_pct, $r.roundtrip_status, $shaShort))
}
[void]$txt.AppendLine('')
[void]$txt.AppendLine('Per-file detail:')
foreach ($r in $results) {
    [void]$txt.AppendLine('')
    [void]$txt.AppendLine($r.input_file)
    [void]$txt.AppendLine((" raw = {0} bytes" -f $r.raw_bytes))
    [void]$txt.AppendLine((" gzip(raw) = {0} bytes" -f $r.gzip_raw_bytes))
    [void]$txt.AppendLine((" OCC V3 = {0} bytes ({1})" -f $r.occ_v3_bytes, $r.occ_v3_mode))
    [void]$txt.AppendLine((" gzip(OCC V3) = {0} bytes ({1}% vs gzip(raw))" -f $r.gzip_occ_v3_bytes, $r.gzip_occ_vs_gzip_raw_pct))
    [void]$txt.AppendLine((" winner : {0}" -f $r.winner_message))
    [void]$txt.AppendLine((" roundtrip : {0}" -f $r.roundtrip_status))
    [void]$txt.AppendLine((" sha-256 : raw={0}" -f $r.raw_sha256))
    [void]$txt.AppendLine((" rec={0}" -f $r.reconstructed_sha256))
}
[void]$txt.AppendLine('')
[void]$txt.AppendLine('End of report.')
[System.IO.File]::WriteAllText((Join-Path $scriptDir 'stacked-compression-test-run-v3.txt'), $txt.ToString(), $enc8)

Write-Output 'Wrote: stacked-compression-results-v3.json'
Write-Output 'Wrote: stacked-compression-test-run-v3.txt'
Write-Output 'Done.'