Upload formats.py with huggingface_hub
Browse files- formats.py +44 -2
formats.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from typing import (
|
2 |
Any,
|
3 |
Dict,
|
@@ -14,9 +15,51 @@ class Format(StreamInstanceOperator):
|
|
14 |
pass
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
class SystemFormat(Format):
|
18 |
r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
|
19 |
|
|
|
|
|
20 |
SystemFormat expects the input instance to contain:
|
21 |
1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task independent opening text.
|
22 |
2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
|
@@ -107,7 +150,6 @@ class SystemFormat(Format):
|
|
107 |
instance=instance, field_name="system_prompt"
|
108 |
)
|
109 |
|
110 |
-
# pop "system_prompt", "instruction", and "target_prefix" from instance
|
111 |
if "target_prefix" in instance:
|
112 |
instance.pop("target_prefix")
|
113 |
if "instruction" in instance:
|
@@ -122,7 +164,6 @@ class SystemFormat(Format):
|
|
122 |
demos is not None and isoftype(demos, List[Dict[str, Any]])
|
123 |
), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
|
124 |
demo_instances = demos
|
125 |
-
# pop demos from instance
|
126 |
instance.pop(self.demos_field)
|
127 |
|
128 |
demos_string = ""
|
@@ -143,5 +184,6 @@ class SystemFormat(Format):
|
|
143 |
target_prefix=target_prefix,
|
144 |
**self.format_args,
|
145 |
)
|
|
|
146 |
instance["source"] = output
|
147 |
return instance
|
|
|
1 |
+
import re
|
2 |
from typing import (
|
3 |
Any,
|
4 |
Dict,
|
|
|
15 |
pass
|
16 |
|
17 |
|
18 |
+
def apply_capital_new_line_notation(text: str) -> str:
    r"""Transform a string by applying the Capital New Line Notation.

    The Capital New Line Notation uses the two-character marker \N (a backslash
    followed by a capital N) to request "a single newline here, unless there is
    nothing before it". The marker is resolved together with any real newline
    characters (\n) and other \N markers that immediately precede it:

    1. If the string starts with a run of \n characters and/or \N markers that
       ends with \N, the whole run is removed, so no newline is introduced when
       there is no preceding text.
    2. Anywhere else, a run of \n characters and/or \N markers that ends with
       \N is replaced by exactly one \n.

    Real newlines that follow the last \N of a run are not part of the run and
    are left untouched. Text without any \N marker is returned unchanged.

    Args:
        text (str): The input string, potentially containing the \N marker
            mixed with actual newline characters.

    Returns:
        str: The string with every \N run resolved according to the rules above.

    Examples:
        >>> apply_capital_new_line_notation("Hello World\n\n\\N")
        'Hello World\n'

        >>> apply_capital_new_line_notation("\n\n\\NGoodbye World")
        'Goodbye World'

        >>> apply_capital_new_line_notation("\\N")
        ''

        >>> apply_capital_new_line_notation("QUESTION\\NANSWER")
        'QUESTION\nANSWER'
    """
    # A run with nothing before it (anchored at the start of the string) is
    # dropped entirely. "^" is used without re.MULTILINE on purpose: only the
    # very beginning of the whole text counts as "no preceding text".
    text = re.sub(r"^(?:\n|\\N)*\\N", "", text)
    # Every remaining run collapses to one newline. The run is expressed as an
    # alternation of whole tokens (a newline or the two-character marker); a
    # character class such as [\n(\\N)] would instead match the individual
    # characters "(", ")", "\" and "N" and eat ordinary text before the marker.
    return re.sub(r"(?:\n|\\N)*\\N", "\n", text)
|
56 |
+
|
57 |
+
|
58 |
class SystemFormat(Format):
|
59 |
r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
|
60 |
|
61 |
+
Important: formats can use '\N' notations that means new-line if no new-line before and no empty string before.
|
62 |
+
|
63 |
SystemFormat expects the input instance to contain:
|
64 |
1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task independent opening text.
|
65 |
2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
|
|
|
150 |
instance=instance, field_name="system_prompt"
|
151 |
)
|
152 |
|
|
|
153 |
if "target_prefix" in instance:
|
154 |
instance.pop("target_prefix")
|
155 |
if "instruction" in instance:
|
|
|
164 |
demos is not None and isoftype(demos, List[Dict[str, Any]])
|
165 |
), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
|
166 |
demo_instances = demos
|
|
|
167 |
instance.pop(self.demos_field)
|
168 |
|
169 |
demos_string = ""
|
|
|
184 |
target_prefix=target_prefix,
|
185 |
**self.format_args,
|
186 |
)
|
187 |
+
output = apply_capital_new_line_notation(output)
|
188 |
instance["source"] = output
|
189 |
return instance
|