Skip to content

Commit

Permalink
Refactor utf-8 check
Browse files Browse the repository at this point in the history
  • Loading branch information
BolunThompson committed Dec 12, 2024
1 parent dcf1e32 commit ea94fec
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions shasta/bash_to_shasta_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,21 +376,26 @@ def split_utf8(word: bytes) -> list[bytes]:
i = 0
while i < len(word):
for j in range(1, 5): # UTF-8 characters can be between 1 and 4 bytes long
try:
# Attempt to decode the next 1-4 bytes
char = word[i : i + j].decode("utf-8")
# Attempt to decode the next 1-4 bytes
if valid_utf8(word[i : i + j]):
split_bytes.append(word[i : i + j])
i += j # Move past the successfully decoded character
break
except UnicodeDecodeError:
else:
if (
j == 4
): # If we've reached 4 bytes without success, it's an invalid sequence
split_bytes.append(word[i : i + 1])
i += 1 # Move past the invalid byte
return split_bytes


def valid_utf8(str: bytes) -> bool:  # noqa: A002 - parameter name kept for callers
    """Return True if the given bytes decode cleanly as strict UTF-8.

    The decoded text is discarded; only whether decoding succeeds matters.
    An empty byte string is considered valid, and a truncated multi-byte
    sequence is considered invalid.

    Args:
        str: The raw bytes to check. The name shadows the builtin ``str``
            inside this function; it is left unchanged so existing keyword
            callers keep working.

    Returns:
        True when ``str.decode("utf-8")`` succeeds, False when it raises
        ``UnicodeDecodeError``.
    """
    try:
        str.decode("utf-8")
    except UnicodeDecodeError:
        return False
    return True

def to_arg_char_string(word: str) -> list[ArgChar]:
    """Convert a text word into a list of ``ArgChar`` values.

    The word is encoded to UTF-8 and handed to ``to_arg_char_bytes`` together
    with a fresh empty list as its second argument.
    NOTE(review): the meaning of that second argument is defined by
    ``to_arg_char_bytes`` and is not visible here - confirm against it.

    Args:
        word: The string to convert.

    Returns:
        Whatever ``to_arg_char_bytes`` returns for the encoded word.
    """
    return to_arg_char_bytes(word.encode("utf-8"), [])

Expand Down

0 comments on commit ea94fec

Please sign in to comment.