File size: 1,450 Bytes
f745baf
26197e0
 
 
f745baf
 
 
 
 
 
 
 
26197e0
 
 
 
 
f745baf
 
 
 
 
 
 
 
 
 
26197e0
 
f745baf
 
 
 
 
26197e0
 
 
 
 
 
 
 
 
 
f745baf
26197e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import concurrent.futures
from extractors.model import LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel, AnyParserModel

# Default value of the `timeout` parameter of run_extract_parallel, in seconds.
DEFAULT_TIMEOUT = 30

# One instance of each extractor model, created once when this module is
# imported and shared by every call below.
ap_rt = AnyParserModel()
lp = LlamaParseModel()
un = UnstructuredModel()
gpt = GPTModel()
claude = ClaudeModel()

# Maps a model name to the bound `run` method of the matching extractor
# instance; run_extract looks the callable up here by name.
model_function_map = {
    "AnyParser": ap_rt.run,
    "LlamaParse": lp.run,
    "Unstructured": un.run,
    "GPT-4o-mini": gpt.run,
    "Claude-3.5-Sonnet": claude.run,
}

# Names of all registered models, in the insertion order of the map above.
models = [key for key in model_function_map]

def run_extract(model, file_path):
    """Run the extractor registered under *model* on *file_path*.

    The callable is looked up in ``model_function_map`` and its return
    value is passed back unchanged (presumably the extracted markdown --
    confirm against the extractor implementations).

    Raises:
        KeyError: if *model* is not a key of ``model_function_map``.
    """
    # Log the request before dispatching so failed lookups are still traced.
    print('Running extract: model', model, 'file_path', file_path)
    return model_function_map[model](file_path)


def run_extract_parallel(model_a, model_b, pdf, timeout=DEFAULT_TIMEOUT):
    """Run two extractors on the same file concurrently.

    Both extractions are submitted to a thread pool at the same time and
    share a single wait of at most *timeout* seconds.

    Args:
        model_a: Name of the first model (a key of ``model_function_map``).
        model_b: Name of the second model (a key of ``model_function_map``).
        pdf: Path of the file handed to both extractors.
        timeout: Maximum number of seconds to wait for both results.
            ``None`` waits until both extractions finish.

    Returns:
        A ``(result_a, result_b)`` tuple. Each element is the value returned
        by ``run_extract`` for that model, or the string
        ``"Error: Timeout after {timeout} seconds"`` if that extraction had
        not finished when the timeout expired.

    Raises:
        Any exception raised inside ``run_extract`` for an extraction that
        finished in time is re-raised here.
    """
    # The executor is deliberately NOT used as a context manager: leaving a
    # ``with`` block calls ``shutdown(wait=True)``, which would block until
    # both extractors finish and make the timeout ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
    try:
        future_a = executor.submit(run_extract, model_a, pdf)
        future_b = executor.submit(run_extract, model_b, pdf)

        # One shared wait: both tasks started together, so waiting on them
        # one after the other could block for up to twice the timeout.
        concurrent.futures.wait([future_a, future_b], timeout=timeout)

        timeout_message = f"Error: Timeout after {timeout} seconds"
        # ``result()`` on a finished future returns (or raises) immediately.
        result_a = future_a.result() if future_a.done() else timeout_message
        result_b = future_b.result() if future_b.done() else timeout_message
    finally:
        # Return without joining the workers. A timed-out extraction keeps
        # running in its thread in the background; threads cannot be
        # forcibly stopped.
        executor.shutdown(wait=False)

    return result_a, result_b