Skip to content

Commit 309162d

Browse files
committed
Avoid conversion of consonant virama to chillus in some cases
പിഡബ്ള്യൂഡി, ബാറ്റ്സ്മാൻ
1 parent c3a7380 commit 309162d

File tree

2 files changed

+5
-2
lines changed

2 files changed

+5
-2
lines changed

libindic/normalizer/rules/normalizer.ml.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ common_mistakes: # Regex patterns for common mistakes in Malayalam raw corpus, A
5252
'\u200B': '' # Remove all Zero Width space characters
5353
'\u00AD': '' # Remove all soft hyphen characters
5454
'ര്(?![\s{PUNCTUATION}]|യ|$)': '' # Replace ര് with ർ when not at word end, string end and not followed by യ
55-
'റ്(?![\s{PUNCTUATION}\u200c]|യ|വ|ല|ര|റ|$)': '' # Replace റ് with ർ when not at word end, string end and not followed by റ, ര, വ, ല, യ
56-
'ള്(?![\s{PUNCTUATION}]|ള|$)': '' # Replace ള് with ൾ when not at word end, string end and not followed by ള
55+
'(?<!റ്)റ്(?![\s{PUNCTUATION}\u200c]|യ|വ|ല|ര|റ|$)': '' # Replace റ് with ർ when not at word end, string end and not followed by റ, ര, വ, ല, യ and not preceded by റ്
56+
'ള്(?![\s{PUNCTUATION}]|ള|യ|$)': '' # Replace ള് with ൾ when not at word end, string end and not followed by ള or യ
5757
'ദു:ഖ': 'ദുഃഖ' # Common Mistake
5858
'നമ:': 'നമഃ' # Common Mistake
5959
'(^|\s)ാ': '\1ആ' # Map vowel sign "ാ" to "ആ" at the beginning of a word

libindic/normalizer/tests/test_normalizer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def test_normalize(self):
6060
self.assertEqual(normalize('കാറ്-'), 'കാറ്')
6161
self.assertEqual(normalize('കാറ് '), 'കാറ് ')
6262
self.assertEqual(normalize('കാറ്റ്'), 'കാറ്റ്')
63+
self.assertEqual(normalize('മീറ്ററിൽ'), 'മീറ്ററിൽ')
64+
self.assertEqual(normalize('പിഡബ്ള്യൂഡി'), 'പിഡബ്ള്യൂഡി')
65+
self.assertEqual(normalize('ബാറ്റ്സ്മാൻ'), 'ബാറ്റ്സ്മാൻ')
6366
self.assertEqual(normalize('അൽഭുതം അത്ഭുതം ചികിൽസാപിഴവ്', remove_punctuations=False), "അദ്ഭുതം അദ്ഭുതം ചികിത്സാപിഴവ്")
6467
self.assertEqual(normalize('ദു:ഖത്തിന്റെ–'), 'ദുഃഖത്തിന്റെ')
6568
self.assertEqual(normalize('ദു:ഖത്തിന്റെ-', remove_punctuations=False),

0 commit comments

Comments
 (0)