зеркало из
				https://github.com/ssciwr/AMMICO.git
				synced 2025-10-30 13:36:04 +02:00 
			
		
		
		
	improve handling of exceptions for googletrans (#244)
* truncate text to max length for googletrans, skip translation in case of error
* restrict peft version for backwards compatibility with old transformers required by lavis
Этот коммит содержится в:
		
							родитель
							
								
									e12929a909
								
							
						
					
					
						Коммит
						3cf1b5466a
					
				| @ -154,6 +154,16 @@ def test_check_add_space_after_full_stop(accepted): | |||||||
|     assert test_obj.subdict["text"] == "www. icanhascheezburger. com" |     assert test_obj.subdict["text"] == "www. icanhascheezburger. com" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_truncate_text(accepted): | ||||||
|  |     test_obj = tt.TextDetector({}, accept_privacy=accepted) | ||||||
|  |     test_obj.subdict["text"] = "I like cats and dogs." | ||||||
|  |     test_obj._truncate_text() | ||||||
|  |     assert test_obj.subdict["text"] == "I like cats and dogs." | ||||||
|  |     test_obj.subdict["text"] = 20000 * "m" | ||||||
|  |     test_obj._truncate_text() | ||||||
|  |     assert test_obj.subdict["text"] == 5000 * "m" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.mark.gcv | @pytest.mark.gcv | ||||||
| def test_analyse_image(set_testdict, set_environ, accepted): | def test_analyse_image(set_testdict, set_environ, accepted): | ||||||
|     for item in set_testdict: |     for item in set_testdict: | ||||||
|  | |||||||
| @ -112,7 +112,7 @@ class TextDetector(AnalysisMethod): | |||||||
|             raise ValueError( |             raise ValueError( | ||||||
|                 "Privacy disclosure not accepted - skipping text detection." |                 "Privacy disclosure not accepted - skipping text detection." | ||||||
|             ) |             ) | ||||||
|         self.translator = Translator() |         self.translator = Translator(raise_exception=True) | ||||||
|         if not isinstance(analyse_text, bool): |         if not isinstance(analyse_text, bool): | ||||||
|             raise ValueError("analyse_text needs to be set to true or false") |             raise ValueError("analyse_text needs to be set to true or false") | ||||||
|         self.analyse_text = analyse_text |         self.analyse_text = analyse_text | ||||||
| @ -259,6 +259,12 @@ class TextDetector(AnalysisMethod): | |||||||
|             ) |             ) | ||||||
|             add_one += 1 |             add_one += 1 | ||||||
| 
 | 
 | ||||||
|  |     def _truncate_text(self, max_length: int = 5000) -> str: | ||||||
|  |         """Truncate the text if it is too long for googletrans.""" | ||||||
|  |         if self.subdict["text"] and len(self.subdict["text"]) > max_length: | ||||||
|  |             print("Text is too long - truncating to {} characters.".format(max_length)) | ||||||
|  |             self.subdict["text"] = self.subdict["text"][:max_length] | ||||||
|  | 
 | ||||||
|     def analyse_image(self) -> dict: |     def analyse_image(self) -> dict: | ||||||
|         """Perform text extraction and analysis of the text. |         """Perform text extraction and analysis of the text. | ||||||
| 
 | 
 | ||||||
| @ -274,6 +280,7 @@ class TextDetector(AnalysisMethod): | |||||||
|             # make sure all full stops are followed by whitespace |             # make sure all full stops are followed by whitespace | ||||||
|             # otherwise googletrans breaks |             # otherwise googletrans breaks | ||||||
|             self._check_add_space_after_full_stop() |             self._check_add_space_after_full_stop() | ||||||
|  |             self._truncate_text() | ||||||
|             self.translate_text() |             self.translate_text() | ||||||
|             self.remove_linebreaks() |             self.remove_linebreaks() | ||||||
|             if self.analyse_text: |             if self.analyse_text: | ||||||
| @ -329,13 +336,18 @@ class TextDetector(AnalysisMethod): | |||||||
|             raise ValueError( |             raise ValueError( | ||||||
|                 "Privacy disclosure not accepted - skipping text translation." |                 "Privacy disclosure not accepted - skipping text translation." | ||||||
|             ) |             ) | ||||||
|  |         try: | ||||||
|             translated = self.translator.translate(self.subdict["text"]) |             translated = self.translator.translate(self.subdict["text"]) | ||||||
|         self.subdict["text_language"] = translated.src |         except Exception: | ||||||
|         self.subdict["text_english"] = translated.text |             print("Could not translate the text with error {}.".format(Exception)) | ||||||
|  |             translated = None | ||||||
|  |             print("Skipping translation for this text.") | ||||||
|  |         self.subdict["text_language"] = translated.src if translated else None | ||||||
|  |         self.subdict["text_english"] = translated.text if translated else None | ||||||
| 
 | 
 | ||||||
|     def remove_linebreaks(self): |     def remove_linebreaks(self): | ||||||
|         """Remove linebreaks from original and translated text.""" |         """Remove linebreaks from original and translated text.""" | ||||||
|         if self.subdict["text"]: |         if self.subdict["text"] and self.subdict["text_english"]: | ||||||
|             self.subdict["text"] = self.subdict["text"].replace("\n", " ") |             self.subdict["text"] = self.subdict["text"].replace("\n", " ") | ||||||
|             self.subdict["text_english"] = self.subdict["text_english"].replace( |             self.subdict["text_english"] = self.subdict["text_english"].replace( | ||||||
|                 "\n", " " |                 "\n", " " | ||||||
|  | |||||||
| @ -35,6 +35,7 @@ dependencies = [ | |||||||
|     "matplotlib", |     "matplotlib", | ||||||
|     "numpy<=1.23.4", |     "numpy<=1.23.4", | ||||||
|     "pandas", |     "pandas", | ||||||
|  |     "peft<=0.13.0", | ||||||
|     "Pillow", |     "Pillow", | ||||||
|     "pooch", |     "pooch", | ||||||
|     "protobuf", |     "protobuf", | ||||||
|  | |||||||
		Загрузка…
	
	
			
			x
			
			
		
	
		Ссылка в новой задаче
	
	Block a user
	 Inga Ulusoy
						Inga Ulusoy