# Extract statistical pronunciation rules from TIMIT.
# First pass, LLM-generated.
def get_tuples(filename):
    """Read a TIMIT annotation file into (start, end, label) tuples.

    Each valid line has exactly three space-separated fields:
    start sample, end sample, and a label. Malformed lines are skipped.
    """
    tuples = []
    with open(filename) as handle:
        for raw in handle:
            fields = raw.strip().split(" ")
            if len(fields) == 3:
                start, end, label = fields
                tuples.append((int(start), int(end), label))
    return tuples
def filter_junk(phones):
    """Drop non-speech TIMIT phone labels and normalize 'ax-h' to 'ax'.

    Removes closure segments (labels ending in 'cl') and the silence /
    pause markers 'epi', 'pau', and 'h#'.
    """
    kept = []
    for start, end, label in phones:
        if label.endswith("cl") or label in ("epi", "pau", "h#"):
            continue
        kept.append((start, end, "ax" if label == "ax-h" else label))
    return kept
def get_phonetic_words(filename):
    """Pair each word in a TIMIT .WRD file with the phones of its .PHN file.

    Accepts either the .WRD or the .PHN path and derives the other.
    Returns a list of dicts with 'start', 'end', 'word', and 'phones'
    keys, or None when the filename has neither extension.
    """
    if filename.endswith(".WRD"):
        wordfile, phonfile = filename, filename.replace(".WRD", ".PHN")
    elif filename.endswith(".PHN"):
        wordfile, phonfile = filename.replace(".PHN", ".WRD"), filename
    else:
        return None

    words = get_tuples(wordfile)
    phones = filter_junk(get_tuples(phonfile))

    merged = []
    j = 0  # phone cursor; never rewinds, so each phone is examined once
    for start, end, text in words:
        entry = {
            "start": start,
            "end": end,
            "word": text,
            "phones": []
        }
        while j < len(phones):
            p_start, p_end, p_label = phones[j]
            if p_start >= start and p_end <= end:
                # Phone lies entirely inside the current word.
                entry["phones"].append(p_label)
                j += 1
            elif p_start >= end:
                # Phone starts at or after word end - move to next word.
                break
            else:
                # Phone starts before word but doesn't fit - skip it.
                j += 1
        merged.append(entry)
    return merged
# Smoke-test the merge on a single utterance. get_phonetic_words returns
# None when the path has an unexpected extension, so guard before iterating.
result = get_phonetic_words("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data/TRAIN/DR1/FCJF0/SA1.WRD")
if result:
    for item in result:
        print(f"{item['word']}: {' '.join(item['phones'])}")
import glob
from collections import defaultdict

# Collect every attested pronunciation of every word in the corpus.
BASE_PATH = "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data"
word_pronunciations = defaultdict(list)
for wrd_file in glob.glob(f"{BASE_PATH}/**/*.WRD", recursive=True):
    phonetic_words = get_phonetic_words(wrd_file)
    if not phonetic_words:
        continue
    for item in phonetic_words:
        word_pronunciations[item["word"].lower()].append(tuple(item["phones"]))
print(f"Collected pronunciations for {len(word_pronunciations)} unique words")
def load_timit_dict(filename):
    """Parse TIMITDIC.TXT format: word /pronunciation/"""
    entries = {}
    with open(filename) as handle:
        for raw in handle:
            stripped = raw.strip()
            # Skip blanks, ';' comment lines, and lines with no pronunciation.
            if not stripped or stripped.startswith(";") or "/" not in stripped:
                continue
            # Format: word /p r o n u n c i a t i o n/
            headword, _, pron_text = stripped.partition("/")
            phones = tuple(pron_text.rstrip("/").strip().split())
            entries[headword.strip().lower()] = phones
    return entries
# Try common locations for the dictionary
dict_paths = [
    f"{BASE_PATH}/TIMITDIC.TXT",
    f"{BASE_PATH}/../TIMITDIC.TXT",
    "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/TIMITDIC.TXT",
]
timit_dict = None
for path in dict_paths:
    try:
        timit_dict = load_timit_dict(path)
    except FileNotFoundError:
        continue
    print(f"Loaded dictionary from {path}: {len(timit_dict)} entries")
    break
if timit_dict is None:
    # Help debugging on Kaggle: show what the dataset actually contains.
    print("TIMIT dictionary not found - listing available files...")
    import os
    for item in os.listdir("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/"):
        print(f" {item}")
def align_sequences(ref, hyp):
    """
    Levenshtein-align two phone sequences via dynamic programming.

    Returns the edit script as a list of operations:
    ('match', r, h), ('sub', r, h), ('del', r, None), ('ins', None, h)
    """
    m, n = len(ref), len(hyp)
    # dist[i][j] = edit distance between ref[:i] and hyp[:j].
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        dist[i][0] = i
    for j in range(1, n + 1):
        dist[0][j] = j
    for i in range(1, m + 1):
        row, prev = dist[i], dist[i - 1]
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                row[j] = prev[j - 1]
            else:
                # Cheapest of deletion, insertion, substitution.
                row[j] = 1 + min(prev[j], row[j - 1], prev[j - 1])
    # Walk back from the corner, preferring match, then sub, del, ins
    # (same tie-breaking order as the forward fill implies).
    ops = []
    i, j = m, n
    while i or j:
        diagonal = i > 0 and j > 0
        if diagonal and ref[i - 1] == hyp[j - 1]:
            ops.append(('match', ref[i - 1], hyp[j - 1]))
            i, j = i - 1, j - 1
        elif diagonal and dist[i][j] == dist[i - 1][j - 1] + 1:
            ops.append(('sub', ref[i - 1], hyp[j - 1]))
            i, j = i - 1, j - 1
        elif i > 0 and dist[i][j] == dist[i - 1][j] + 1:
            ops.append(('del', ref[i - 1], None))
            i -= 1
        else:
            ops.append(('ins', None, hyp[j - 1]))
            j -= 1
    ops.reverse()
    return ops
# Test alignment on a hand-picked reference/hypothesis pair.
ref = ('dh', 'ax', 's', 't', 'ao', 'r')
hyp = ('dh', 'ix', 's', 'ao', 'r')
alignment = align_sequences(ref, hyp)
print("Reference:", ref)
print("Hypothesis:", hyp)
print("Alignment:")
for op in alignment:
    print(f" {op}")
from collections import Counter

# Tallies of how dictionary phones are realized in the corpus.
substitutions = Counter()  # (ref_phone, actual_phone) -> count
deletions = Counter()      # ref_phone -> count
insertions = Counter()     # actual_phone -> count
matches = Counter()        # phone -> count (for computing rates)
words_analyzed = 0
words_not_in_dict = 0
if timit_dict:
    for word, pronunciations in word_pronunciations.items():
        dict_pron = timit_dict.get(word)
        if dict_pron is None:
            words_not_in_dict += 1
            continue
        for actual_pron in pronunciations:
            words_analyzed += 1
            for op, ref_phone, actual_phone in align_sequences(dict_pron, actual_pron):
                if op == 'match':
                    matches[ref_phone] += 1
                elif op == 'sub':
                    substitutions[(ref_phone, actual_phone)] += 1
                elif op == 'del':
                    deletions[ref_phone] += 1
                elif op == 'ins':
                    insertions[actual_phone] += 1
    print(f"Words analyzed: {words_analyzed}")
    print(f"Words not in dictionary: {words_not_in_dict}")
    print(f"\nUnique substitution types: {len(substitutions)}")
    print(f"Unique deletions: {len(deletions)}")
    print(f"Unique insertions: {len(insertions)}")
else:
    print("Cannot analyze - dictionary not loaded")
print("=== Top 20 Substitutions ===")
for (ref, actual), count in substitutions.most_common(20):
    # Rate: this substitution vs. all outcomes where the phone kept a slot.
    slot_total = matches[ref] + sum(c for (r, _), c in substitutions.items() if r == ref)
    pct = count / slot_total * 100 if slot_total > 0 else 0
    print(f" {ref} -> {actual}: {count} ({pct:.1f}%)")
print("\n=== Top 20 Deletions ===")
for phone, count in deletions.most_common(20):
    slot_total = matches[phone] + deletions[phone] + sum(c for (r, _), c in substitutions.items() if r == phone)
    pct = count / slot_total * 100 if slot_total > 0 else 0
    print(f" {phone} deleted: {count} ({pct:.1f}%)")
print("\n=== Top 20 Insertions ===")
for phone, count in insertions.most_common(20):
    print(f" {phone} inserted: {count}")
# Format suitable for applying to CMUdict
def compute_transformation_rules(matches, substitutions, deletions, min_count=5, min_rate=1.0):
    """
    Compute transformation rules from the collected alignment statistics.

    Args:
        matches: mapping phone -> match count.
        substitutions: mapping (ref_phone, actual_phone) -> count.
        deletions: mapping ref_phone -> count.
        min_count: minimum occurrences for a transformation to be kept.
        min_rate: minimum rate (percent of the phone's total occurrences).

    Returns:
        dict: phone -> list of (target, count, rate) sorted by count
        descending, where target is a phone or None (deletion).
    """
    # Group substitutions by source phone once, instead of rescanning the
    # whole counter for every phone (was O(phones * substitutions)).
    subs_by_phone = {}
    for (src, dst), count in substitutions.items():
        subs_by_phone.setdefault(src, []).append((dst, count))

    # Every phone that appears on the reference side of an alignment.
    all_ref_phones = set(matches) | set(deletions) | set(subs_by_phone)

    rules = {}
    for phone in all_ref_phones:
        phone_subs = subs_by_phone.get(phone, [])
        # Total reference occurrences of this phone. .get() keeps plain
        # dicts working, not just Counters.
        total = matches.get(phone, 0) + deletions.get(phone, 0)
        total += sum(count for _, count in phone_subs)
        if total == 0:
            continue
        transformations = []
        # Substitutions that clear both thresholds.
        for target, count in phone_subs:
            if count >= min_count:
                rate = count / total * 100
                if rate >= min_rate:
                    transformations.append((target, count, rate))
        # Deletion of this phone, if frequent enough.
        del_count = deletions.get(phone, 0)
        if del_count >= min_count:
            rate = del_count / total * 100
            if rate >= min_rate:
                transformations.append((None, del_count, rate))
        if transformations:
            # Sort by count descending (stable, so insertion order breaks ties).
            transformations.sort(key=lambda t: -t[1])
            rules[phone] = transformations
    return rules
rules = compute_transformation_rules(matches, substitutions, deletions)
print("=== Transformation Rules (min 5 occurrences, min 1% rate) ===")
for phone in sorted(rules):
    print(f"\n{phone}:")
    for target, count, rate in rules[phone]:
        if target is None:
            print(f" -> ∅ (delete): {count} ({rate:.1f}%)")
        else:
            print(f" -> {target}: {count} ({rate:.1f}%)")
# TIMIT uses a slightly different phoneset than CMUdict
TIMIT_TO_ARPABET = {
    # Vowels - TIMIT often has more distinctions
    'ax': 'AH',    # schwa
    'ix': 'IH',    # reduced high front (often schwa-like)
    'ux': 'UW',    # reduced high back
    'axr': 'ER',   # schwa + r
    'ax-h': 'AH',  # breathy schwa
    'em': 'M',     # syllabic m (CMU doesn't have this)
    'en': 'N',     # syllabic n (CMU doesn't have this)
    'eng': 'NG',   # syllabic ng
    'el': 'L',     # syllabic l (CMU doesn't have this)
    'nx': 'N',     # flap (alveolar nasal)
    'dx': 'D',     # flap (often realized as D or T)
    'q': '',       # glottal stop (not in CMU)
    'hv': 'HH',    # voiced h
    # Direct mappings (lowercase to uppercase)
    'aa': 'AA', 'ae': 'AE', 'ah': 'AH', 'ao': 'AO', 'aw': 'AW',
    'ay': 'AY', 'eh': 'EH', 'er': 'ER', 'ey': 'EY', 'ih': 'IH',
    'iy': 'IY', 'ow': 'OW', 'oy': 'OY', 'uh': 'UH', 'uw': 'UW',
    'b': 'B', 'ch': 'CH', 'd': 'D', 'dh': 'DH', 'f': 'F', 'g': 'G',
    'hh': 'HH', 'jh': 'JH', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
    'ng': 'NG', 'p': 'P', 'r': 'R', 's': 'S', 'sh': 'SH', 't': 'T',
    'th': 'TH', 'v': 'V', 'w': 'W', 'y': 'Y', 'z': 'Z', 'zh': 'ZH',
}
def timit_to_arpabet(timit_phones):
    """Convert TIMIT phone sequence to ARPABET (CMUdict format)"""
    # Unknown phones fall back to uppercase; empty mappings (e.g. the
    # glottal stop 'q') are dropped entirely.
    mapped = (TIMIT_TO_ARPABET.get(p, p.upper()) for p in timit_phones)
    return tuple(m for m in mapped if m)
# Convert rules to ARPABET
arpabet_rules = {}
for phone, transforms in rules.items():
    src = TIMIT_TO_ARPABET.get(phone, phone.upper())
    if not src:
        # Phones that map to nothing (glottal stop) have no ARPABET rule.
        continue
    bucket = arpabet_rules.setdefault(src, [])
    for target, count, rate in transforms:
        if target is None:
            bucket.append((None, count, rate))
            continue
        tgt = TIMIT_TO_ARPABET.get(target, target.upper())
        # The mapping can collapse distinct TIMIT phones onto one ARPABET
        # phone; drop the resulting identity mappings.
        if tgt and tgt != src:
            bucket.append((tgt, count, rate))
print("=== Rules in ARPABET format ===")
for phone in sorted(arpabet_rules):
    transforms = arpabet_rules[phone]
    if not transforms:
        continue
    print(f"\n{phone}:")
    for target, count, rate in transforms:
        if target is None:
            print(f" -> ∅ (delete): {count} ({rate:.1f}%)")
        else:
            print(f" -> {target}: {count} ({rate:.1f}%)")
import json

# Serialize the non-empty rules to JSON for downstream use on CMUdict.
export_rules = {
    phone: [{"target": t, "count": c, "rate": round(r, 2)} for t, c, r in transforms]
    for phone, transforms in arpabet_rules.items()
    if transforms
}
with open("timit_transformation_rules.json", "w") as f:
    json.dump(export_rules, f, indent=2)
print(f"Exported {len(export_rules)} phone rules to timit_transformation_rules.json")
def generate_variants(pronunciation, rules, max_variants=10):
    """
    Generate pronunciation variants by applying transformation rules.

    Applies one rule at a time to the base pronunciation, so only
    single-edit variants are produced.

    Args:
        pronunciation: sequence of ARPABET phones, optionally carrying
            stress digits (e.g. 'AO1').
        rules: phone -> list of {"target": ..., "count": ..., "rate": ...}
            dicts, where a target of None means deletion.
        max_variants: cap on the number of variants returned (includes
            the base pronunciation).

    Returns:
        list of phone tuples: the base pronunciation first, then unique
        variants in deterministic rule-application order (previously the
        result was an unordered set, so order varied across runs under
        hash randomization).
    """
    base = tuple(pronunciation)
    variants = [base]
    seen = {base}
    for i, phone in enumerate(pronunciation):
        # Strip stress markers for the rule lookup ('AO1' -> 'AO').
        phone_base = ''.join(c for c in phone if not c.isdigit())
        for rule in rules.get(phone_base, ()):
            target = rule["target"]
            if target is None:
                # Deletion: drop the phone entirely.
                candidate = base[:i] + base[i + 1:]
            else:
                # Substitution: preserve the stress marker if present.
                stress = ''.join(c for c in phone if c.isdigit())
                candidate = base[:i] + (target + stress,) + base[i + 1:]
            if candidate not in seen:
                seen.add(candidate)
                variants.append(candidate)
            if len(variants) >= max_variants:
                return variants
    return variants
# Example with a word
example_pron = ['W', 'AO1', 'T', 'ER0']  # "water" in CMUdict format
print(f"Base pronunciation: {' '.join(example_pron)}")
print("Variants:")
for variant in generate_variants(example_pron, export_rules):
    print(f" {' '.join(variant)}")