Skip to content

Commit ad58736

Browse files
authored
MVP version of combi-search implemented (#13)
1 parent f6958a2 commit ad58736

File tree

3 files changed

+66
-2
lines changed

3 files changed

+66
-2
lines changed

GenEC/core/analyze.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class ConfigOptions(Enum):
2828
class TextFilterTypes(Enum):
2929
REGEX = 'Regex'
3030
POSITIONAL = 'Positional'
31+
COMBI_SEARCH = 'Combi-search'
3132
KEYWORD = 'Keyword_UNSUPPORTED'
3233
SPLIT_KEYWORDS = 'Split-keywords_UNSUPPORTED'
3334

@@ -121,6 +122,15 @@ def request_text_filter(self):
121122
'line': int(self.ask_open_question('Please provide the line number in the cluster: ')),
122123
'occurrence': int(self.ask_open_question('Please provide the occurrence number: '))}
123124
return positional_text_filter
125+
elif self.config.get(ConfigOptions.TEXT_FILTER_TYPE.value) == TextFilterTypes.COMBI_SEARCH.value:
126+
combi_search_filters = []
127+
index = 1
128+
while True:
129+
combi_search_filters.append(self.ask_open_question('Please provide a regex filter for search {0}: '.format(index)))
130+
index += 1
131+
if self.ask_open_question('Do you wish to provide a next search parameter [yes/y]: ').lower() not in YES_INPUT:
132+
break
133+
return combi_search_filters
124134
else:
125135
raise ValueError('Unsupported filter type: %s' % self.config.get(ConfigOptions.TEXT_FILTER_TYPE.value))
126136

@@ -174,6 +184,8 @@ def extract_from_data(self, data, file):
174184
return self.extract_text_from_clusters_by_regex(clusters)
175185
elif self.config.get(ConfigOptions.TEXT_FILTER_TYPE.value) == TextFilterTypes.POSITIONAL.value:
176186
return self.extract_text_from_clusters_by_position(clusters)
187+
elif self.config.get(ConfigOptions.TEXT_FILTER_TYPE.value) == TextFilterTypes.COMBI_SEARCH.value:
188+
return self.extract_text_from_clusters_by_combi_search(clusters)
177189
else:
178190
raise ValueError("Unsupported filter type: %s" % self.config.get(ConfigOptions.TEXT_FILTER_TYPE.value))
179191

@@ -205,10 +217,11 @@ def get_sliced_clusters(self, clusters, start_keyword='', end_keyword=''):
205217

206218
return (clusters[start_cluster_index:end_cluster_index+1])
207219

208-
def extract_text_from_clusters_by_regex(self, clusters):
220+
def extract_text_from_clusters_by_regex(self, clusters, regex_pattern=None):
    '''Collect the first regex hit from every cluster that matches.

    Args:
        clusters: Iterable of cluster strings to search.
        regex_pattern: Optional pattern that overrides the configured text
            filter. A falsy value (None or '') falls back to the
            ConfigOptions.TEXT_FILTER entry of self.config.

    Returns:
        A list with one entry per matching cluster, in input order: the
        first capture group when the pattern defines one, otherwise the
        whole matched text. Clusters without a match are skipped.
    '''
    pattern = re.compile(regex_pattern if regex_pattern else self.config.get(ConfigOptions.TEXT_FILTER.value))
    # A pattern without capture groups has no group(1); requesting it raises
    # IndexError, so fall back to the full match (group 0) in that case.
    group_index = 1 if pattern.groups else 0
    filtered_text = []
    for cluster in clusters:
        search_result = pattern.search(cluster)
        if search_result is not None:
            filtered_text.append(search_result.group(group_index))
    return filtered_text
@@ -224,6 +237,14 @@ def extract_text_from_clusters_by_position(self, clusters):
224237
continue
225238
return filtered_text
226239

240+
def extract_text_from_clusters_by_combi_search(self, clusters):
    '''Run a chain of user-defined regex searches and extract text with the last one.

    Every filter except the last only narrows the set of clusters: a cluster
    is kept when the filter matches anywhere in it. The last filter is then
    handed to extract_text_from_clusters_by_regex together with the clusters
    that survived.

    Args:
        clusters: List of cluster strings to search.

    Returns:
        Whatever extract_text_from_clusters_by_regex returns for the
        remaining clusters and the final filter.

    Raises:
        ValueError: If no filter is configured under ConfigOptions.TEXT_FILTER.
    '''
    filters = self.config.get(ConfigOptions.TEXT_FILTER.value)
    # A bare string would otherwise be iterated character by character.
    if isinstance(filters, str):
        filters = [filters]
    if not filters:
        raise ValueError('Combi-search requires at least one regex filter')

    # 'regex_filter' rather than 'filter' so the builtin is not shadowed.
    for regex_filter in filters[:-1]:
        pattern = re.compile(regex_filter)
        clusters = [cluster for cluster in clusters if pattern.search(cluster)]
    return self.extract_text_from_clusters_by_regex(clusters, filters[-1])
247+
227248

228249
class Comparer:
229250
def __init__(self, source, reference):

tests/unit_tests/test_extractor.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,37 @@ def test_extract_from_data_by_position(mock_extract_text_from_clusters_by_positi
6767
assert extractor_instance.extract_from_data('', Files.SOURCE.value) == ['my_result']
6868

6969

70+
@patch.object(Extractor, 'extract_text_from_clusters_by_combi_search')
def test_extract_from_data_by_combi_search(mock_combi_search, extractor_instance):
    '''extract_from_data returns the combi-search result when that filter type is configured.'''
    # Select the combi-search branch and stub out the extraction itself.
    extractor_instance.config[ConfigOptions.TEXT_FILTER_TYPE.value] = TextFilterTypes.COMBI_SEARCH.value
    mock_combi_search.return_value = ['my_result']

    result = extractor_instance.extract_from_data('', Files.SOURCE.value)

    assert result == ['my_result']
75+
76+
77+
@pytest.mark.parametrize(
    'clusters, regex_filters, expected_filtered_clusters',
    [
        # Case 1: one narrowing filter; the clusters containing three digits remain
        (
            ['abc123', 'xyz89', 'test456', 'hello111', 'finalTest'],
            [r'\d{3}', r'test'],
            ['abc123', 'test456', 'hello111'],
        ),
        # Case 2: two narrowing filters; only 'test456' has three digits and 'test'
        (
            ['abc123', 'xyz89', 'test456', 'hello111', 'finalTest'],
            [r'\d{3}', r'test', r'final'],
            ['test456'],
        ),
        # Case 3: no cluster starts with 'z', so the first filter removes everything
        (
            ['abc123', 'xyz89', 'test456', 'hello111', 'finalTest'],
            [r'^z.*$', r'test'],
            [],
        ),
    ],
)
@patch.object(Extractor, 'extract_text_from_clusters_by_regex')
def test_extract_text_from_clusters_by_combi_search(mock_regex_extract, extractor_instance, clusters, regex_filters, expected_filtered_clusters):
    '''All filters but the last narrow the clusters; the last is forwarded to the regex extractor.'''
    extractor_instance.config[ConfigOptions.TEXT_FILTER.value] = regex_filters

    extractor_instance.extract_text_from_clusters_by_combi_search(clusters)

    final_filter = regex_filters[-1]
    mock_regex_extract.assert_called_once_with(expected_filtered_clusters, final_filter)
99+
100+
70101
def test_extract_from_data_unsupported_filter_type(extractor_instance):
71102
data = 'saiucdjh1\ndusi2hiuw\n3134ferw\n4waijc\ndjhe56fk7\niuaijaudc'
72103
extractor_instance.config[ConfigOptions.CLUSTER_FILTER.value] = '\n'

tests/unit_tests/test_input_manager.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,18 @@ def test_request_POSITIONAL_filter_type(mock_input, mock_side_effect, mock_outpu
242242
assert im_instance.request_text_filter() == {'separator': mock_output[0], 'line': mock_output[1], 'occurrence': mock_output[2]}
243243

244244

245+
@pytest.mark.parametrize(
    'mock_side_effect, mock_output',
    [
        # Answers alternate between a regex and the reply to the "next search" prompt;
        # the expected outputs show that 'done', 'n' and '' each end the loop.
        (['regex_1', 'y', 'regex_2', 'done'], ['regex_1', 'regex_2']),
        (['regex_1', 'n'], ['regex_1']),
        (['regex_1', 'Y', 'regex_2', 'YeS', 'regex_3', ''], ['regex_1', 'regex_2', 'regex_3']),
    ],
)
@patch.object(InputManager, 'ask_open_question')
def test_request_COMBI_SEARCH_filter_type(mock_ask, mock_side_effect, mock_output, im_instance):
    '''request_text_filter returns the list of regexes the user entered for combi-search.'''
    im_instance.config[ConfigOptions.TEXT_FILTER_TYPE.value] = TextFilterTypes.COMBI_SEARCH.value
    mock_ask.side_effect = mock_side_effect

    collected_filters = im_instance.request_text_filter()

    assert collected_filters == mock_output
255+
256+
245257
@pytest.mark.parametrize('filter_type', [
246258
(TextFilterTypes.KEYWORD.value),
247259
(TextFilterTypes.SPLIT_KEYWORDS.value)])

0 commit comments

Comments
 (0)