# Extract statistical pronunciation rules from TIMIT.
# First pass, LLM-generated.
def get_tuples(filename):
    """Read a TIMIT annotation file into (start, end, label) tuples.

    Each valid line has exactly three space-separated fields:
    start sample, end sample, and a label. Malformed lines are skipped.
    """
    tuples = []
    with open(filename) as handle:
        for raw in handle:
            fields = raw.strip().split(" ")
            if len(fields) == 3:
                start, end, label = fields
                tuples.append((int(start), int(end), label))
    return tuples
def filter_junk(phones):
    """Drop non-speech TIMIT phone labels and normalize 'ax-h' to 'ax'.

    Removes closure segments (labels ending in 'cl') and the silence /
    pause markers 'epi', 'pau', and 'h#'.
    """
    kept = []
    for start, end, label in phones:
        if label.endswith("cl") or label in ("epi", "pau", "h#"):
            continue
        kept.append((start, end, "ax" if label == "ax-h" else label))
    return kept
def get_phonetic_words(filename):
    """Pair each word in a TIMIT .WRD file with the phones of its .PHN file.

    Accepts either the .WRD or the .PHN path and derives the other.
    Returns a list of dicts with 'start', 'end', 'word', and 'phones'
    keys, or None when the filename has neither extension.
    """
    if filename.endswith(".WRD"):
        wordfile, phonfile = filename, filename.replace(".WRD", ".PHN")
    elif filename.endswith(".PHN"):
        wordfile, phonfile = filename.replace(".PHN", ".WRD"), filename
    else:
        return None

    words = get_tuples(wordfile)
    phones = filter_junk(get_tuples(phonfile))

    merged = []
    j = 0  # phone cursor; never rewinds, so each phone is examined once
    for start, end, text in words:
        entry = {
            "start": start,
            "end": end,
            "word": text,
            "phones": []
        }
        while j < len(phones):
            p_start, p_end, p_label = phones[j]
            if p_start >= start and p_end <= end:
                # Phone lies entirely inside the current word.
                entry["phones"].append(p_label)
                j += 1
            elif p_start >= end:
                # Phone starts at or after word end - move to next word.
                break
            else:
                # Phone starts before word but doesn't fit - skip it.
                j += 1
        merged.append(entry)
    return merged
# Smoke-test the merge on a single utterance. get_phonetic_words returns
# None when the path has an unexpected extension, so guard before iterating.
result = get_phonetic_words("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data/TRAIN/DR1/FCJF0/SA1.WRD")
if result:
    for item in result:
        print(f"{item['word']}: {' '.join(item['phones'])}")
import glob
from collections import defaultdict

# Collect every attested pronunciation of every word in the corpus.
BASE_PATH = "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data"
word_pronunciations = defaultdict(list)
for wrd_file in glob.glob(f"{BASE_PATH}/**/*.WRD", recursive=True):
    phonetic_words = get_phonetic_words(wrd_file)
    if not phonetic_words:
        continue
    for item in phonetic_words:
        word_pronunciations[item["word"].lower()].append(tuple(item["phones"]))
print(f"Collected pronunciations for {len(word_pronunciations)} unique words")
def load_timit_dict(filename):
    """Parse TIMITDIC.TXT format: word /pronunciation/"""
    entries = {}
    with open(filename) as handle:
        for raw in handle:
            stripped = raw.strip()
            # Skip blanks, ';' comment lines, and lines with no pronunciation.
            if not stripped or stripped.startswith(";") or "/" not in stripped:
                continue
            # Format: word /p r o n u n c i a t i o n/
            headword, _, pron_text = stripped.partition("/")
            phones = tuple(pron_text.rstrip("/").strip().split())
            entries[headword.strip().lower()] = phones
    return entries
# Try common locations for the dictionary
dict_paths = [
    f"{BASE_PATH}/TIMITDIC.TXT",
    f"{BASE_PATH}/../TIMITDIC.TXT",
    "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/TIMITDIC.TXT",
]
timit_dict = None
for path in dict_paths:
    try:
        timit_dict = load_timit_dict(path)
    except FileNotFoundError:
        continue
    print(f"Loaded dictionary from {path}: {len(timit_dict)} entries")
    break
if timit_dict is None:
    # Help debugging on Kaggle: show what the dataset actually contains.
    print("TIMIT dictionary not found - listing available files...")
    import os
    for item in os.listdir("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/"):
        print(f" {item}")
def align_sequences(ref, hyp):
    """
    Levenshtein-align two phone sequences via dynamic programming.

    Returns the edit script as a list of operations:
    ('match', r, h), ('sub', r, h), ('del', r, None), ('ins', None, h)
    """
    m, n = len(ref), len(hyp)
    # dist[i][j] = edit distance between ref[:i] and hyp[:j].
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        dist[i][0] = i
    for j in range(1, n + 1):
        dist[0][j] = j
    for i in range(1, m + 1):
        row, prev = dist[i], dist[i - 1]
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                row[j] = prev[j - 1]
            else:
                # Cheapest of deletion, insertion, substitution.
                row[j] = 1 + min(prev[j], row[j - 1], prev[j - 1])
    # Walk back from the corner, preferring match, then sub, del, ins
    # (same tie-breaking order as the forward fill implies).
    ops = []
    i, j = m, n
    while i or j:
        diagonal = i > 0 and j > 0
        if diagonal and ref[i - 1] == hyp[j - 1]:
            ops.append(('match', ref[i - 1], hyp[j - 1]))
            i, j = i - 1, j - 1
        elif diagonal and dist[i][j] == dist[i - 1][j - 1] + 1:
            ops.append(('sub', ref[i - 1], hyp[j - 1]))
            i, j = i - 1, j - 1
        elif i > 0 and dist[i][j] == dist[i - 1][j] + 1:
            ops.append(('del', ref[i - 1], None))
            i -= 1
        else:
            ops.append(('ins', None, hyp[j - 1]))
            j -= 1
    ops.reverse()
    return ops
# Test alignment on a hand-picked reference/hypothesis pair.
ref = ('dh', 'ax', 's', 't', 'ao', 'r')
hyp = ('dh', 'ix', 's', 'ao', 'r')
alignment = align_sequences(ref, hyp)
print("Reference:", ref)
print("Hypothesis:", hyp)
print("Alignment:")
for op in alignment:
    print(f" {op}")
from collections import Counter

# Tallies of how dictionary phones are realized in the corpus.
substitutions = Counter()  # (ref_phone, actual_phone) -> count
deletions = Counter()      # ref_phone -> count
insertions = Counter()     # actual_phone -> count
matches = Counter()        # phone -> count (for computing rates)
words_analyzed = 0
words_not_in_dict = 0
if timit_dict:
    for word, pronunciations in word_pronunciations.items():
        dict_pron = timit_dict.get(word)
        if dict_pron is None:
            words_not_in_dict += 1
            continue
        for actual_pron in pronunciations:
            words_analyzed += 1
            for op, ref_phone, actual_phone in align_sequences(dict_pron, actual_pron):
                if op == 'match':
                    matches[ref_phone] += 1
                elif op == 'sub':
                    substitutions[(ref_phone, actual_phone)] += 1
                elif op == 'del':
                    deletions[ref_phone] += 1
                elif op == 'ins':
                    insertions[actual_phone] += 1
    print(f"Words analyzed: {words_analyzed}")
    print(f"Words not in dictionary: {words_not_in_dict}")
    print(f"\nUnique substitution types: {len(substitutions)}")
    print(f"Unique deletions: {len(deletions)}")
    print(f"Unique insertions: {len(insertions)}")
else:
    print("Cannot analyze - dictionary not loaded")
print("=== Top 20 Substitutions ===")
for (ref, actual), count in substitutions.most_common(20):
    # Rate: this substitution vs. all outcomes where the phone kept a slot.
    slot_total = matches[ref] + sum(c for (r, _), c in substitutions.items() if r == ref)
    pct = count / slot_total * 100 if slot_total > 0 else 0
    print(f" {ref} -> {actual}: {count} ({pct:.1f}%)")
print("\n=== Top 20 Deletions ===")
for phone, count in deletions.most_common(20):
    slot_total = matches[phone] + deletions[phone] + sum(c for (r, _), c in substitutions.items() if r == phone)
    pct = count / slot_total * 100 if slot_total > 0 else 0
    print(f" {phone} deleted: {count} ({pct:.1f}%)")
print("\n=== Top 20 Insertions ===")
for phone, count in insertions.most_common(20):
    print(f" {phone} inserted: {count}")
# Format suitable for applying to CMUdict
def compute_transformation_rules(matches, substitutions, deletions, min_count=5, min_rate=1.0):
    """
    Compute transformation rules from the collected alignment statistics.

    Args:
        matches: mapping phone -> match count.
        substitutions: mapping (ref_phone, actual_phone) -> count.
        deletions: mapping ref_phone -> count.
        min_count: minimum occurrences for a transformation to be kept.
        min_rate: minimum rate (percent of the phone's total occurrences).

    Returns:
        dict: phone -> list of (target, count, rate) sorted by count
        descending, where target is a phone or None (deletion).
    """
    # Group substitutions by source phone once, instead of rescanning the
    # whole counter for every phone (was O(phones * substitutions)).
    subs_by_phone = {}
    for (src, dst), count in substitutions.items():
        subs_by_phone.setdefault(src, []).append((dst, count))

    # Every phone that appears on the reference side of an alignment.
    all_ref_phones = set(matches) | set(deletions) | set(subs_by_phone)

    rules = {}
    for phone in all_ref_phones:
        phone_subs = subs_by_phone.get(phone, [])
        # Total reference occurrences of this phone. .get() keeps plain
        # dicts working, not just Counters.
        total = matches.get(phone, 0) + deletions.get(phone, 0)
        total += sum(count for _, count in phone_subs)
        if total == 0:
            continue
        transformations = []
        # Substitutions that clear both thresholds.
        for target, count in phone_subs:
            if count >= min_count:
                rate = count / total * 100
                if rate >= min_rate:
                    transformations.append((target, count, rate))
        # Deletion of this phone, if frequent enough.
        del_count = deletions.get(phone, 0)
        if del_count >= min_count:
            rate = del_count / total * 100
            if rate >= min_rate:
                transformations.append((None, del_count, rate))
        if transformations:
            # Sort by count descending (stable, so insertion order breaks ties).
            transformations.sort(key=lambda t: -t[1])
            rules[phone] = transformations
    return rules
rules = compute_transformation_rules(matches, substitutions, deletions)
print("=== Transformation Rules (min 5 occurrences, min 1% rate) ===")
for phone in sorted(rules):
    print(f"\n{phone}:")
    for target, count, rate in rules[phone]:
        if target is None:
            print(f" -> ∅ (delete): {count} ({rate:.1f}%)")
        else:
            print(f" -> {target}: {count} ({rate:.1f}%)")
# TIMIT uses a slightly different phoneset than CMUdict
TIMIT_TO_ARPABET = {
    # Vowels - TIMIT often has more distinctions
    'ax': 'AH',    # schwa
    'ix': 'IH',    # reduced high front (often schwa-like)
    'ux': 'UW',    # reduced high back
    'axr': 'ER',   # schwa + r
    'ax-h': 'AH',  # breathy schwa
    'em': 'M',     # syllabic m (CMU doesn't have this)
    'en': 'N',     # syllabic n (CMU doesn't have this)
    'eng': 'NG',   # syllabic ng
    'el': 'L',     # syllabic l (CMU doesn't have this)
    'nx': 'N',     # flap (alveolar nasal)
    'dx': 'D',     # flap (often realized as D or T)
    'q': '',       # glottal stop (not in CMU)
    'hv': 'HH',    # voiced h
    # Direct mappings (lowercase to uppercase)
    'aa': 'AA', 'ae': 'AE', 'ah': 'AH', 'ao': 'AO', 'aw': 'AW',
    'ay': 'AY', 'eh': 'EH', 'er': 'ER', 'ey': 'EY', 'ih': 'IH',
    'iy': 'IY', 'ow': 'OW', 'oy': 'OY', 'uh': 'UH', 'uw': 'UW',
    'b': 'B', 'ch': 'CH', 'd': 'D', 'dh': 'DH', 'f': 'F', 'g': 'G',
    'hh': 'HH', 'jh': 'JH', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
    'ng': 'NG', 'p': 'P', 'r': 'R', 's': 'S', 'sh': 'SH', 't': 'T',
    'th': 'TH', 'v': 'V', 'w': 'W', 'y': 'Y', 'z': 'Z', 'zh': 'ZH',
}
def timit_to_arpabet(timit_phones):
    """Convert TIMIT phone sequence to ARPABET (CMUdict format)"""
    # Unknown phones fall back to uppercase; empty mappings (e.g. the
    # glottal stop 'q') are dropped entirely.
    mapped = (TIMIT_TO_ARPABET.get(p, p.upper()) for p in timit_phones)
    return tuple(m for m in mapped if m)
# Convert rules to ARPABET
arpabet_rules = {}
for phone, transforms in rules.items():
    src = TIMIT_TO_ARPABET.get(phone, phone.upper())
    if not src:
        # Phones that map to nothing (glottal stop) have no ARPABET rule.
        continue
    bucket = arpabet_rules.setdefault(src, [])
    for target, count, rate in transforms:
        if target is None:
            bucket.append((None, count, rate))
            continue
        tgt = TIMIT_TO_ARPABET.get(target, target.upper())
        # The mapping can collapse distinct TIMIT phones onto one ARPABET
        # phone; drop the resulting identity mappings.
        if tgt and tgt != src:
            bucket.append((tgt, count, rate))
print("=== Rules in ARPABET format ===")
for phone in sorted(arpabet_rules):
    transforms = arpabet_rules[phone]
    if not transforms:
        continue
    print(f"\n{phone}:")
    for target, count, rate in transforms:
        if target is None:
            print(f" -> ∅ (delete): {count} ({rate:.1f}%)")
        else:
            print(f" -> {target}: {count} ({rate:.1f}%)")
import json

# Serialize the non-empty rules to JSON for downstream use on CMUdict.
export_rules = {
    phone: [{"target": t, "count": c, "rate": round(r, 2)} for t, c, r in transforms]
    for phone, transforms in arpabet_rules.items()
    if transforms
}
with open("timit_transformation_rules.json", "w") as f:
    json.dump(export_rules, f, indent=2)
print(f"Exported {len(export_rules)} phone rules to timit_transformation_rules.json")
def generate_variants(pronunciation, rules, max_variants=10):
    """
    Generate pronunciation variants by applying transformation rules.

    Applies one rule at a time to the base pronunciation, so only
    single-edit variants are produced.

    Args:
        pronunciation: sequence of ARPABET phones, optionally carrying
            stress digits (e.g. 'AO1').
        rules: phone -> list of {"target": ..., "count": ..., "rate": ...}
            dicts, where a target of None means deletion.
        max_variants: cap on the number of variants returned (includes
            the base pronunciation).

    Returns:
        list of phone tuples: the base pronunciation first, then unique
        variants in deterministic rule-application order (previously the
        result was an unordered set, so order varied across runs under
        hash randomization).
    """
    base = tuple(pronunciation)
    variants = [base]
    seen = {base}
    for i, phone in enumerate(pronunciation):
        # Strip stress markers for the rule lookup ('AO1' -> 'AO').
        phone_base = ''.join(c for c in phone if not c.isdigit())
        for rule in rules.get(phone_base, ()):
            target = rule["target"]
            if target is None:
                # Deletion: drop the phone entirely.
                candidate = base[:i] + base[i + 1:]
            else:
                # Substitution: preserve the stress marker if present.
                stress = ''.join(c for c in phone if c.isdigit())
                candidate = base[:i] + (target + stress,) + base[i + 1:]
            if candidate not in seen:
                seen.add(candidate)
                variants.append(candidate)
            if len(variants) >= max_variants:
                return variants
    return variants
# Example with a word
example_pron = ['W', 'AO1', 'T', 'ER0']  # "water" in CMUdict format
print(f"Base pronunciation: {' '.join(example_pron)}")
print("Variants:")
for variant in generate_variants(example_pron, export_rules):
    print(f" {' '.join(variant)}")