Elron committed on
Commit
7e6fc99
·
1 Parent(s): 8779e08

Upload text_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. text_utils.py +55 -90
text_utils.py CHANGED
@@ -2,6 +2,15 @@ import re
2
 
3
 
4
  def split_words(s):
 
 
 
 
 
 
 
 
 
5
  # Split PascalCase or camelCase
6
  s = re.sub("([A-Z][a-z]+)", r" \1", re.sub("([A-Z]+)", r" \1", s)).strip()
7
  # Split snake_case or kebab-case
@@ -15,16 +24,41 @@ def split_words(s):
15
 
16
 
17
  def is_camel_case(s):
18
- # The string must start with an uppercase letter, followed by zero or more sequences of an uppercase letter followed by zero or more lowercase letters.
 
 
 
 
 
 
 
 
19
  return re.match(r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", s) is not None
20
 
21
 
22
  def is_snake_case(s):
23
- # The string must start with a lowercase letter, followed by zero or more sequences of an underscore followed by one or more lowercase letters.
 
 
 
 
 
 
 
 
24
  return re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s) is not None
25
 
26
 
27
  def camel_to_snake_case(s):
 
 
 
 
 
 
 
 
 
28
  # Add an underscore before every uppercase letter that is preceded by a character other than an uppercase letter, an underscore or a hyphen
29
  s = re.sub("(?<=[^A-Z_-])([A-Z])", r"_\1", s)
30
 
@@ -39,6 +73,15 @@ import shutil
39
 
40
 
41
  def print_dict(d, indent=0, indent_delta=4, max_chars=None):
 
 
 
 
 
 
 
 
 
42
  max_chars = max_chars or shutil.get_terminal_size()[0] - 10 # Get terminal size if max_chars not set
43
  indent_str = " " * indent
44
  indent_delta_str = " " * indent_delta
@@ -67,97 +110,19 @@ def print_dict(d, indent=0, indent_delta=4, max_chars=None):
67
 
68
 
69
  def nested_tuple_to_string(nested_tuple: tuple) -> str:
 
 
 
 
 
 
 
 
 
70
  result = []
71
  for item in nested_tuple:
72
  if isinstance(item, tuple):
73
  result.append(nested_tuple_to_string(item))
74
  else:
75
  result.append(str(item))
76
- return "_".join(result)
77
-
78
-
79
- if __name__ == "__main__":
80
- # Define test cases
81
- test_cases = [
82
- ("example1", ["example", "1"]),
83
- ("exampleOne", ["example", "One"]),
84
- ("123example456", ["123", "example", "456"]),
85
- ("happyDay", ["happy", "Day"]),
86
- ("thisIsATest", ["this", "Is", "A", "Test"]),
87
- ("TestAI2023", ["Test", "AI", "2023"]),
88
- ("stringWith1Number", ["string", "With", "1", "Number"]),
89
- ("camelCaseExample", ["camel", "Case", "Example"]),
90
- ("snake_case_example", ["snake", "case", "example"]),
91
- ("snake_case2example3", ["snake", "case", "2", "example", "3"]),
92
- ("kebab-case-example", ["kebab", "case", "example"]),
93
- ("kebab-case2example3", ["kebab", "case", "2", "example", "3"]),
94
- ("PascalCaseExample", ["Pascal", "Case", "Example"]),
95
- ("Title Case Example", ["Title", "Case", "Example"]),
96
- ("Mixed1Example_case", ["Mixed", "1", "Example", "case"]),
97
- ("Mixed2Example-case", ["Mixed", "2", "Example", "case"]),
98
- ("Mixed3_Example-case", ["Mixed", "3", "Example", "case"]),
99
- ("UPPERCASEEXAMPLE", ["UPPERCASEEXAMPLE"]),
100
- ("lowercaseexample", ["lowercaseexample"]),
101
- ("mixedUPanddown", ["mixed", "U", "Panddown"]),
102
- ]
103
-
104
- # Loop through test cases
105
- for i, (input_string, expected_output) in enumerate(test_cases, 1):
106
- # Apply function and check result
107
- if split_words(input_string) != expected_output:
108
- print(f"Failed on example {i}: {input_string}")
109
- print(f"Expected: {expected_output}, but got: {split_words(input_string)}\n")
110
-
111
- is_camel_case_test_cases = [
112
- ("isCamelCase", False),
113
- ("notCamelCase", False),
114
- ("camelCase", False),
115
- ("Notcamelcase", True),
116
- ("camel_Case", False),
117
- ("camelCase123", False),
118
- ("camelcase", False),
119
- ("CAMELCASE", True),
120
- ("camel-case", False),
121
- ("HFLoader", True),
122
- ]
123
-
124
- for input_string, expected_output in is_camel_case_test_cases:
125
- if is_camel_case(input_string) != expected_output:
126
- print(f"Failed on is_camel_case: {input_string}")
127
- print(f"Expected: {expected_output}, but got: {is_camel_case(input_string)}\n")
128
-
129
- is_snake_case_test_cases = [
130
- ("is_snake_case", True),
131
- ("Not_snake_case", False),
132
- ("snake_case", True),
133
- ("snake_Case", False),
134
- ("Snakecase", False),
135
- ("snake-case", False),
136
- ("snake_case123", True),
137
- ("123snake_case", True),
138
- ("snakecase", True),
139
- ]
140
-
141
- for input_string, expected_output in is_snake_case_test_cases:
142
- if is_snake_case(input_string) != expected_output:
143
- print(f"Failed on is_snake_case: {input_string}")
144
- print(f"Expected: {expected_output}, but got: {is_snake_case(input_string)}\n")
145
-
146
- camel_to_snake_case_test_cases = [
147
- ("camelToSnake", "camel_to_snake"),
148
- ("CamelToSnake", "camel_to_snake"),
149
- ("CamelToSnakeCase", "camel_to_snake_case"),
150
- ("camelToSnakeCase123", "camel_to_snake_case123"),
151
- ("123CamelToSnakeCase", "123_camel_to_snake_case"),
152
- ("camelTo_Snake_Case", "camel_to__snake__case"),
153
- ("camelTo-Snake-Case", "camel_to-_snake-_case"),
154
- ("camelToSnakeCASE", "camel_to_snake_case"),
155
- ("CAMELToSnakeCase", "camel_to_snake_case"),
156
- ("camelToSNAKECase", "camel_to_snake_case"),
157
- ("HFLoader", "hf_loader"),
158
- ]
159
-
160
- for input_string, expected_output in camel_to_snake_case_test_cases:
161
- if camel_to_snake_case(input_string) != expected_output:
162
- print(f"Failed on camel_to_snake_case: {input_string}")
163
- print(f"Expected: {expected_output}, but got: {camel_to_snake_case(input_string)}\n")
 
2
 
3
 
4
  def split_words(s):
5
+ """
6
+ Splits a string into words based on PascalCase, camelCase, snake_case, kebab-case, and numbers attached to strings.
7
+
8
+ Args:
9
+ s (str): The string to be split.
10
+
11
+ Returns:
12
+ list: The list of words obtained after splitting the string.
13
+ """
14
  # Split PascalCase or camelCase
15
  s = re.sub("([A-Z][a-z]+)", r" \1", re.sub("([A-Z]+)", r" \1", s)).strip()
16
  # Split snake_case or kebab-case
 
24
 
25
 
26
  def is_camel_case(s):
27
+ """
28
+ Checks if a string is in upper camel case (PascalCase), i.e. starts with an uppercase letter followed by letters and digits only.
29
+
30
+ Args:
31
+ s (str): The string to be checked.
32
+
33
+ Returns:
34
+ bool: True if the string is in upper camel case (PascalCase, starting with an uppercase letter), False otherwise.
35
+ """
36
  return re.match(r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", s) is not None
37
 
38
 
39
  def is_snake_case(s):
40
+ """
41
+ Checks if a string is in snake_case.
42
+
43
+ Args:
44
+ s (str): The string to be checked.
45
+
46
+ Returns:
47
+ bool: True if the string is in snake_case, False otherwise.
48
+ """
49
  return re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s) is not None
50
 
51
 
52
  def camel_to_snake_case(s):
53
+ """
54
+ Converts a string from camelCase to snake_case.
55
+
56
+ Args:
57
+ s (str): The string to be converted.
58
+
59
+ Returns:
60
+ str: The string converted to snake_case.
61
+ """
62
  # Add an underscore before every uppercase letter that is preceded by a character other than an uppercase letter, an underscore or a hyphen
63
  s = re.sub("(?<=[^A-Z_-])([A-Z])", r"_\1", s)
64
 
 
73
 
74
 
75
  def print_dict(d, indent=0, indent_delta=4, max_chars=None):
76
+ """
77
+ Prints a dictionary in a formatted manner, taking into account the terminal width.
78
+
79
+ Args:
80
+ d (dict): The dictionary to be printed.
81
+ indent (int, optional): The current level of indentation. Defaults to 0.
82
+ indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.
83
+ max_chars (int, optional): The maximum number of characters for each line. Defaults to terminal width - 10.
84
+ """
85
  max_chars = max_chars or shutil.get_terminal_size()[0] - 10 # Get terminal size if max_chars not set
86
  indent_str = " " * indent
87
  indent_delta_str = " " * indent_delta
 
110
 
111
 
112
  def nested_tuple_to_string(nested_tuple: tuple) -> str:
113
+ """
114
+ Converts a nested tuple to a string, with elements separated by underscores.
115
+
116
+ Args:
117
+ nested_tuple (tuple): The nested tuple to be converted.
118
+
119
+ Returns:
120
+ str: The string representation of the nested tuple.
121
+ """
122
  result = []
123
  for item in nested_tuple:
124
  if isinstance(item, tuple):
125
  result.append(nested_tuple_to_string(item))
126
  else:
127
  result.append(str(item))
128
+ return "_".join(result)