|
41 | 41 |
|
42 | 42 | VALIDATE = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT_VALIDATE', False) |
43 | 43 |
|
| 44 | +# Pattern matching Unicode surrogate code points (U+D800-U+DFFF), which can cause |
| 45 | +# false-positive "(c)" copyright detections when text has been decoded improperly |
| 46 | +SURROGATE_PATTERN = re.compile(r'[\uD800-\uDFFF]') |
| 47 | + |
44 | 48 |
|
45 | 49 | # Tracing flags |
46 | 50 | def logger_debug(*args): |
@@ -83,6 +87,24 @@ def logger_debug(*args): |
83 | 87 | """ |
84 | 88 |
|
85 | 89 |
|
| 90 | +def sanitize_line_for_detection(text): |
| 91 | + """ |
| 92 | + Sanitize a line of text to prevent false positive copyright detections. |
| 93 | + |
| 94 | + Remove Unicode surrogate code points (U+D800-U+DFFF). When text is |
| 95 | + improperly decoded, these can be misread as "(c)" copyright symbols |
| 96 | + and add noise to copyright detection results. |
| 97 | + |
| 98 | + For example, surrogate characters in files like unicode_full-bmp.txt |
| 99 | + were incorrectly detected as: "copyright: (c) $?i (c) Y" |
| 100 | + |
| 101 | + See: https://github.com/nexB/scancode-toolkit/issues/4381 |
| 102 | + """ |
| 103 | + if not text: |
| 104 | + return text |
| 105 | + return SURROGATE_PATTERN.sub('', text) |
| 106 | + |
| 107 | + |
86 | 108 | def detect_copyrights( |
87 | 109 | location, |
88 | 110 | include_copyrights=True, |
@@ -174,6 +196,12 @@ def detect_copyrights_from_lines( |
174 | 196 | if not numbered_lines: |
175 | 197 | return |
176 | 198 |
|
| 199 | + # Sanitize lines to remove surrogate characters that cause false positives |
| 200 | + numbered_lines = [ |
| 201 | + (line_num, sanitize_line_for_detection(text)) |
| 202 | + for line_num, text in numbered_lines |
| 203 | + ] |
| 204 | + |
177 | 205 | include_copyright_years = include_copyrights and include_copyright_years |
178 | 206 | include_copyright_allrights = include_copyrights and include_copyright_allrights |
179 | 207 |
|
@@ -1251,7 +1279,7 @@ def build_detection_from_node( |
1251 | 1279 | (r'^Comment[A-Z]', 'JUNK'), |
1252 | 1280 | (r'^fall$', 'JUNK'), |
1253 | 1281 | (r'^[Aa]nother$', 'JUNK'), |
1254 | | - (r'^[Aa]acute', 'JUNK'), |
| 1282 | + (r'^[Aa]cute', 'JUNK'), |
1255 | 1283 | (r'^[Aa]circumflex', 'JUNK'), |
1256 | 1284 | (r'^[Kk]eywords?', 'JUNK'), |
1257 | 1285 | (r'^comparing$', 'JUNK'), |
@@ -1481,7 +1509,6 @@ def build_detection_from_node( |
1481 | 1509 | (r'^Port$', 'NN'), |
1482 | 1510 | (r'^GnuPG$', 'NN'), |
1483 | 1511 | (r'^Government.', 'NNP'), |
1484 | | - (r'^OProfile$', 'NNP'), |
1485 | 1512 | (r'^Government$', 'COMP'), |
1486 | 1513 | # there is a Ms. Grant |
1487 | 1514 | (r'^Grant$', 'NNP'), |
@@ -2276,12 +2303,6 @@ def build_detection_from_node( |
2276 | 2303 | # URLS such as <(http://fedorahosted.org/lohit)> or () |
2277 | 2304 | (r'[<\(]https?:.*[>\)]', 'URL'), |
2278 | 2305 | # URLS such as ibm.com without a scheme |
2279 | | - (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'), |
2280 | | - # TODO: add more extensions: there are so many TLDs these days! |
2281 | | - # URL wrapped in () or <> |
2282 | | - (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'), |
2283 | | - (r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'), |
2284 | | - # derived from regex in cluecode.finder |
2285 | 2306 | (r'<?a?.(href)?.(' |
2286 | 2307 | r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+' |
2287 | 2308 | r'|(?:www|ftp)\.[^\s<>\[\]"]+' |
@@ -2902,7 +2923,7 @@ def build_detection_from_node( |
2902 | 2923 |
|
2903 | 2924 | # Gracenote, Inc., copyright © 2000-2008 Gracenote. |
2904 | 2925 | # Gracenote Software, copyright © 2000-2008 Gracenote. |
2905 | | - # COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12 |
| 2926 | + COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12 |
2906 | 2927 |
|
2907 | 2928 | # Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995. |
2908 | 2929 | COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999 |
@@ -3083,6 +3104,7 @@ def build_detection_from_node( |
3083 | 3104 |
|
3084 | 3105 | COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <NN|CAPS>* <COMPANY>?} #2300 |
3085 | 3106 |
|
| 3107 | + # Copyright (c) 2014, 2015, the respective contributors All rights reserved |
3086 | 3108 | # Copyright (c) 2014, 2015, the respective contributors All rights reserved. |
3087 | 3109 | COPYRIGHT: {<COPYRIGHT|COPYRIGHT2> <NN|NNP|CONTRIBUTORS>+ <ALLRIGHTRESERVED>} #2862 |
3088 | 3110 |
|
|
0 commit comments