Skip to content

Commit 026f053

Browse files
Alex-Weng and claude committed
Fix Kokoro ZH model shape mismatch and add mixed language support
- Fix check_array_shape to properly detect MLX vs PyTorch conv weight formats - Update weight sanitization in kokoro.py and istftnet.py to use format detection - Add Chinese-to-Bopomofo conversion using pypinyin for ZH model compatibility - Add number-to-Chinese conversion for proper TTS of numeric content - Add mixed Chinese/English text processing in pipeline - Update tests for check_array_shape function Fixes #226 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 1153d67 commit 026f053

File tree

6 files changed

+319
-52
lines changed

6 files changed

+319
-52
lines changed

mlx_audio/base.py

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,23 +16,68 @@ def from_dict(cls, params):
1616

1717

1818
def check_array_shape(arr):
    """
    Decide whether a conv weight tensor is already laid out for MLX.

    1D conv weights:
        MLX:     (out_channels, kernel_size, in_channels)
        PyTorch: (out_channels, in_channels, kernel_size)

    2D conv weights:
        MLX:     (out_channels, kH, kW, in_channels)
        PyTorch: (out_channels, in_channels, kH, kW)

    Returns True when the layout looks like MLX (no transpose required),
    False when it looks like PyTorch (transpose required).

    Heuristic: kernel extents are small (<= 15, e.g. 1, 3, 5, 7, 9, 11)
    while channel counts are usually much larger (64, 128, 256, 512, ...).
    """
    kernel_max = 15  # dims above this are assumed to be channel dims
    shape = arr.shape
    ndim = len(shape)

    if ndim == 4:
        out_ch, a, b, c = shape
        a_small, b_small, c_small = (d <= kernel_max for d in (a, b, c))
        if a_small and b_small and not c_small:
            return True   # (out, kH, kW, in) -> MLX
        if b_small and c_small and not a_small:
            return False  # (out, in, kH, kW) -> PyTorch
        # Ambiguous: fall back to the legacy square-kernel check.
        return out_ch >= a and out_ch >= b and a == b

    if ndim == 3:
        _, mid, last = shape
        if mid <= kernel_max < last:
            return True   # small middle dim = kernel -> MLX
        if last <= kernel_max < mid:
            return False  # small last dim = kernel -> PyTorch
        # Both remaining dims are kernel-sized. A singleton usually means
        # in_channels=1; pointwise (kernel=1) convs are rarer than 3/5/7.
        if mid == 1 and last > 1:
            return False  # (out, in=1, kernel) -> PyTorch
        if last == 1 and mid > 1:
            return True   # (out, kernel, in=1) -> MLX
        # Kernel tends to be <= in_channels when both are small.
        return mid <= last

    return False

mlx_audio/tts/models/base.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,51 @@ def from_dict(cls, params):
1919

2020

2121
def check_array_shape(arr):
    """
    Decide whether a 1D conv weight tensor is already laid out for MLX.

    MLX format:     (out_channels, kernel_size, in_channels)
    PyTorch format: (out_channels, in_channels, kernel_size)

    Returns True when the layout looks like MLX (no transpose required),
    False when it looks like PyTorch (transpose required) or the array
    is not 3-dimensional.

    Heuristic: kernel extents are small (<= 15, e.g. 1, 3, 5, 7, 9, 11)
    while channel counts are usually much larger (64, 128, 256, 512, ...).
    """
    kernel_max = 15  # dims above this are assumed to be channel dims
    shape = arr.shape

    # Only 1D conv weights (rank 3) are handled here.
    if len(shape) != 3:
        return False

    _, mid, last = shape

    if mid <= kernel_max < last:
        return True   # small middle dim = kernel -> MLX
    if last <= kernel_max < mid:
        return False  # small last dim = kernel -> PyTorch

    # Both remaining dims are kernel-sized. A singleton usually means
    # in_channels=1; pointwise (kernel=1) convs are rarer than 3/5/7.
    if mid == 1 and last > 1:
        return False  # (out, in=1, kernel) -> PyTorch
    if last == 1 and mid > 1:
        return True   # (out, kernel, in=1) -> MLX

    # Kernel tends to be <= in_channels when both are small.
    return mid <= last
3567

3668

3769
def adjust_speed(audio_array, speed_factor):

mlx_audio/tts/models/kokoro/istftnet.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,11 @@ def __call__(self, asr, F0_curve, N, s):
965965
def sanitize(self, key, weights):
966966
sanitized_weights = None
967967
if "noise_convs" in key and key.endswith(".weight"):
968-
sanitized_weights = weights.transpose(0, 2, 1)
968+
# Only transpose if in PyTorch format
969+
if check_array_shape(weights):
970+
sanitized_weights = weights
971+
else:
972+
sanitized_weights = weights.transpose(0, 2, 1)
969973

970974
elif "weight_v" in key:
971975
if check_array_shape(weights):

mlx_audio/tts/models/kokoro/kokoro.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,12 @@ def sanitize(self, weights):
209209
sanitized_weights[key] = state_dict
210210

211211
if key.startswith("predictor"):
212-
if "F0_proj.weight" in key:
213-
sanitized_weights[key] = state_dict.transpose(0, 2, 1)
214-
215-
elif "N_proj.weight" in key:
216-
sanitized_weights[key] = state_dict.transpose(0, 2, 1)
212+
if "F0_proj.weight" in key or "N_proj.weight" in key:
213+
# Only transpose if in PyTorch format
214+
if check_array_shape(state_dict):
215+
sanitized_weights[key] = state_dict
216+
else:
217+
sanitized_weights[key] = state_dict.transpose(0, 2, 1)
217218

218219
elif "weight_v" in key:
219220
if check_array_shape(state_dict):

mlx_audio/tts/models/kokoro/pipeline.py

Lines changed: 167 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,12 +115,21 @@ def __init__(
115115
raise
116116
elif lang_code == "z":
117117
try:
118-
from misaki import zh
119-
120-
self.g2p = zh.ZHG2P()
118+
from pypinyin import pinyin, Style
119+
120+
self.pinyin = pinyin
121+
self.pinyin_style = Style
122+
# Also initialize English G2P for mixed Chinese/English text
123+
try:
124+
self.en_g2p = en.G2P(trf=False, fallback=None, unk="")
125+
except Exception as e:
126+
logging.warning(f"English G2P not available for mixed text: {e}")
127+
self.en_g2p = None
128+
# Use a simple wrapper as g2p for compatibility
129+
self.g2p = lambda text: (self._chinese_to_bopomofo(text), None)
121130
except ImportError:
122131
logging.error(
123-
"You need to `pip install misaki[zh]` to use lang_code='z'"
132+
"You need to `pip install pypinyin` to use lang_code='z'"
124133
)
125134
raise
126135
else:
@@ -190,6 +199,154 @@ def load_voice(self, voice: str, delimiter: str = ",") -> mx.array:
190199
self.voices[voice] = mx.mean(mx.stack(packs), axis=0)
191200
return self.voices[voice]
192201

202+
def _number_to_chinese(self, num_str: str) -> str:
203+
"""Convert Arabic numerals to Chinese characters.
204+
205+
Examples:
206+
"23" -> "二十三"
207+
"100" -> "一百"
208+
"1000" -> "一千"
209+
"""
210+
digits = "零一二三四五六七八九"
211+
units = ["", "十", "百", "千"]
212+
big_units = ["", "万", "亿"]
213+
214+
if not num_str:
215+
return ""
216+
217+
# Handle decimal numbers
218+
if "." in num_str:
219+
integer_part, decimal_part = num_str.split(".", 1)
220+
integer_chinese = self._number_to_chinese(integer_part) if integer_part else ""
221+
decimal_chinese = "".join(digits[int(d)] for d in decimal_part)
222+
return f"{integer_chinese}{decimal_chinese}"
223+
224+
num = int(num_str)
225+
if num == 0:
226+
return "零"
227+
228+
if num < 0:
229+
return "负" + self._number_to_chinese(str(-num))
230+
231+
result = ""
232+
unit_index = 0
233+
234+
while num > 0:
235+
section = num % 10000
236+
if section > 0:
237+
section_str = ""
238+
for i, unit in enumerate(units):
239+
digit = section % 10
240+
section = section // 10
241+
if digit > 0:
242+
section_str = digits[digit] + unit + section_str
243+
elif section_str and not section_str.startswith("零"):
244+
section_str = "零" + section_str
245+
if section == 0:
246+
break
247+
result = section_str + big_units[unit_index] + result
248+
num = num // 10000
249+
unit_index += 1
250+
251+
# Special case: 10-19 don't need leading "一"
252+
if result.startswith("一十"):
253+
result = result[1:]
254+
255+
return result
256+
257+
def _chinese_to_bopomofo(self, text: str) -> str:
258+
"""Convert Chinese text to Bopomofo with numeric tones.
259+
260+
The Kokoro ZH model expects Bopomofo symbols with numeric tones (1-5).
261+
"""
262+
# Tone mark to number mapping
263+
tone_map = {
264+
"\u02ca": "2", # ˊ tone 2
265+
"\u02c7": "3", # ˇ tone 3
266+
"\u02cb": "4", # ˋ tone 4
267+
"\u02d9": "5", # ˙ neutral tone
268+
}
269+
270+
# First, convert numbers to Chinese characters
271+
# Match sequences of digits (including decimals)
272+
text = re.sub(
273+
r"(\d+\.?\d*)",
274+
lambda m: self._number_to_chinese(m.group(1)),
275+
text,
276+
)
277+
278+
result = []
279+
for char in text:
280+
# Chinese character range
281+
if "\u4e00" <= char <= "\u9fff":
282+
bpmf = self.pinyin(char, style=self.pinyin_style.BOPOMOFO)[0][0]
283+
284+
# Extract tone mark and convert to number
285+
tone = "1" # default tone 1
286+
clean_bpmf = ""
287+
for c in bpmf:
288+
if c in tone_map:
289+
tone = tone_map[c]
290+
else:
291+
clean_bpmf += c
292+
293+
result.append(clean_bpmf + tone)
294+
elif char.isascii() and char.isalpha():
295+
# English letters - will be processed separately
296+
result.append(char)
297+
else:
298+
# Punctuation and other characters
299+
result.append(char)
300+
301+
return " ".join(result)
302+
303+
def _process_mixed_zh_en(self, text: str) -> str:
304+
"""Process mixed Chinese/English text by using appropriate G2P for each part.
305+
306+
Args:
307+
text: Input text containing Chinese and/or English
308+
309+
Returns:
310+
Combined phoneme string with proper phonemes for both languages
311+
"""
312+
# Pattern to match English sequences (letters, spaces, and common punctuation)
313+
pattern = r"([a-zA-Z][a-zA-Z\s,.'\"!\?\-]*)"
314+
315+
parts = re.split(pattern, text)
316+
phonemes = []
317+
318+
for part in parts:
319+
if not part.strip():
320+
continue
321+
322+
# Check if this part starts with English letter
323+
if re.match(r"^[a-zA-Z]", part):
324+
# Process as English
325+
if self.en_g2p:
326+
try:
327+
_, tokens = self.en_g2p(part)
328+
ps = "".join(
329+
t.phonemes + (" " if t.whitespace else "")
330+
for t in tokens
331+
if t.phonemes
332+
)
333+
if ps.strip():
334+
phonemes.append(ps.strip())
335+
except Exception as e:
336+
logging.warning(f"English G2P failed for '{part}': {e}")
337+
# Keep English as-is if G2P fails
338+
phonemes.append(part.strip())
339+
else:
340+
# No English G2P available, keep as-is
341+
phonemes.append(part.strip())
342+
else:
343+
# Process as Chinese using Bopomofo
344+
ps = self._chinese_to_bopomofo(part)
345+
if ps.strip():
346+
phonemes.append(ps.strip())
347+
348+
return " ".join(phonemes)
349+
193350
@classmethod
194351
def tokens_to_ps(cls, tokens: List[en.MToken]) -> str:
195352
return "".join(
@@ -470,7 +627,12 @@ def __call__(
470627
if not chunk.strip():
471628
continue
472629

473-
ps, _ = self.g2p(chunk)
630+
# For Chinese, use mixed language processing if English G2P is available
631+
if self.lang_code == "z" and hasattr(self, "en_g2p") and self.en_g2p:
632+
ps = self._process_mixed_zh_en(chunk)
633+
else:
634+
ps, _ = self.g2p(chunk)
635+
474636
if not ps:
475637
continue
476638
elif len(ps) > 510:

0 commit comments

Comments
 (0)