jjz5463 committed on
Commit 78263be · 1 Parent(s): abcf835

pipeline build up

Experiments/Baseline-Experimental.ipynb DELETED
@@ -1,85 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "initial_id",
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Utilize the Google Cloud Vision API to recognize text in the input images (diary images), https://cloud.google.com/vision.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# Utilize the PaLM 2 Bison for Text model to conduct NLP tasks such as text summarization and condensing on the diary text, https://ai.google.dev/palm_docs/palm.\n"
- ],
- "metadata": {
- "collapsed": false
- },
- "id": "204930c13fd1e579"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# Utilize the Gemini 1.0 Pro Vision to input an image of the diary writer, and output a textual description of the image, https://ai.google.dev/gemini-api/docs/models/gemini.\n"
- ],
- "metadata": {
- "collapsed": false
- },
- "id": "7f0c7d788b8de177"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# Now that you have text from the diary and text describing the diary writer, you can utilize the SDXL-Turbo stable diffusion model to generate images https://huggingface.co/stabilityai/sdxl-turbo. You can try to output several images for a diary entry. Analyze how accurate the results, and think about what could be improved.\n"
- ],
- "metadata": {
- "collapsed": false
- },
- "id": "c475ca58dea760da"
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# You can create a web or mobile-based GUI so that users can experience your solution. Suggested libraries include https://www.gradio.app/ or https://streamlit.io/.\n"
- ],
- "metadata": {
- "collapsed": false
- },
- "id": "ee3a3a8d4027bae3"
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
- }
Experiments/Baseline/GUI.py ADDED
@@ -0,0 +1,24 @@
+ import streamlit as st
+ from PIL import Image
+
+ # You can create a web or mobile-based GUI so that users can experience your solution. Suggested libraries include https://www.gradio.app/ or https://streamlit.io/.
+ st.title('Handwritten Diary to Cartoon Book')
+ uploaded_diary = st.file_uploader("Upload your diary image", type=["png", "jpg", "jpeg"])
+ uploaded_writer_image = st.file_uploader("Upload your photo", type=["png", "jpg", "jpeg"])
+
+ if uploaded_diary and uploaded_writer_image:
+     st.write("Analyzing your diary...")
+
+     diary_text = detect_text_in_image(uploaded_diary)
+     summarized_text = summarize_diary_text(diary_text)
+
+     st.write(f"Summarized Diary Text: {summarized_text}")
+
+     writer_description = analyze_writer_image(uploaded_writer_image)
+     st.write(f"Diary Writer Description: {writer_description}")
+
+     # Generate cartoon image
+     prompt = f"{summarized_text}, featuring a person who {writer_description}"
+     generated_image = generate_image(prompt)
+
+     st.image(generated_image, caption="Generated Cartoon Image")
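Note: as committed, GUI.py calls detect_text_in_image, summarize_diary_text, analyze_writer_image, and generate_image without importing them, and the implementations in baseline_utils.py below expect file paths plus credential/key arguments and save their output to disk rather than returning a PIL image. A minimal sketch of how the GUI could be wired to those utilities, assuming the keys module used by baseline.py (open_ai_keys, gemini_keys) and the service-account JSON path; the temp-file helper and file names are illustrative additions, not part of this commit:

import tempfile

import streamlit as st

from baseline_utils import (analyze_writer_image, detect_text_in_image,
                            generate_image, summarize_diary_text)
from keys.keys import open_ai_keys, gemini_keys  # assumed names, mirroring baseline.py

CREDENTIALS_PATH = "keys/service_account_credentials.json"


def save_upload(uploaded_file):
    # Streamlit hands back an in-memory UploadedFile; the utilities expect a path on disk.
    suffix = "." + uploaded_file.name.rsplit(".", 1)[-1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        return tmp.name


st.title("Handwritten Diary to Cartoon Book")
uploaded_diary = st.file_uploader("Upload your diary image", type=["png", "jpg", "jpeg"])
uploaded_writer_image = st.file_uploader("Upload your photo", type=["png", "jpg", "jpeg"])

if uploaded_diary and uploaded_writer_image:
    st.write("Analyzing your diary...")
    diary_text = detect_text_in_image(save_upload(uploaded_diary), CREDENTIALS_PATH)
    summarized_text = summarize_diary_text(diary_text, open_ai_keys)
    st.write(f"Summarized Diary Text: {summarized_text}")

    writer_description = analyze_writer_image(save_upload(uploaded_writer_image), gemini_keys)
    st.write(f"Diary Writer Description: {writer_description}")

    # generate_image() in baseline_utils saves generated_image.png rather than returning
    # the image, so display the file it writes.
    generate_image(summarized_text, writer_description)
    st.image("generated_image.png", caption="Generated Cartoon Image")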
Experiments/Baseline/baseline.py ADDED
@@ -0,0 +1,12 @@
+ from baseline_utils import *
+ from keys.keys import *
+
+ diary_image_path = "images/test_sample.jpeg"
+ writer_image_path = "images/writer.jpg"
+ credentials_path = "keys/service_account_credentials.json"
+
+ # Detect text from the image using the provided credentials
+ detected_text = detect_text_in_image(diary_image_path, credentials_path)
+ diary_summary = summarize_diary_text(detected_text, open_ai_keys)
+ writer_summary = analyze_writer_image(writer_image_path, gemini_keys)
+ generate_image(diary_summary, writer_summary)
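baseline.py wildcard-imports open_ai_keys and gemini_keys from a keys/keys.py module that is not included in this commit. One plausible minimal layout, shown only for orientation; the environment-variable names and the file contents are assumptions:

# keys/keys.py (illustrative only; the real module is not part of this commit)
import os

# OpenAI API key consumed by summarize_diary_text()
open_ai_keys = os.environ.get("OPENAI_API_KEY", "")

# Google Generative AI (Gemini) API key consumed by analyze_writer_image()
gemini_keys = os.environ.get("GEMINI_API_KEY", "")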
Experiments/Baseline/baseline_utils.py ADDED
@@ -0,0 +1,107 @@
+ import openai
+ from google.cloud import vision
+ from google.oauth2 import service_account
+ import io
+ import google.generativeai as genai
+ from diffusers import AutoPipelineForText2Image
+ import torch
+
+
+ # Utilize the Google Cloud Vision API to recognize text in the
+ # input images (diary images), https://cloud.google.com/vision.
+ def detect_text_in_image(image_path, credentials_path):
+     # Load the service account key from the credentials JSON file
+     credentials = service_account.Credentials.from_service_account_file(credentials_path)
+
+     # Create a Vision API client using the credentials
+     client = vision.ImageAnnotatorClient(credentials=credentials)
+
+     # Open the image file
+     with io.open(image_path, 'rb') as image_file:
+         content = image_file.read()
+
+     # Create an image object for the Vision API
+     image = vision.Image(content=content)
+
+     # Use the Vision API to detect text
+     response = client.text_detection(image=image)
+     texts = response.text_annotations
+
+     # Check for errors in the response
+     if response.error.message:
+         raise Exception(f'{response.error.message}')
+
+     # Return the detected text or an empty string
+     return texts[0].description if texts else ''
+
+
+ # Summarize and condense the diary text. The original plan referenced the PaLM 2 Bison for Text
+ # model (https://ai.google.dev/palm_docs/palm); this baseline uses the OpenAI chat completions API (GPT-4) instead.
+ def summarize_diary_text(text, api_key):
+     # Initialize the OpenAI client
+     client = openai.Client(api_key=api_key)
+
+     # Use the client to call the chat completion API
+     response = client.chat.completions.create(
+         model="gpt-4",  # Use GPT-4
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": f"Summarize the following diary entry: {text}"}
+         ],
+         max_tokens=150,
+         temperature=0.7,
+         n=1  # Number of completions to generate
+     )
+
+     # Extract the summary from the response
+     return response.choices[0].message.content
+
+
+ # Describe the diary writer from a photo. The original plan referenced Gemini 1.0 Pro Vision
+ # (https://ai.google.dev/gemini-api/docs/models/gemini); this implementation uploads the image
+ # and calls the gemini-1.5-flash model through the google-generativeai SDK,
+ # returning the model's textual description of the image.
+ def analyze_writer_image(image_path, api_key):
+     genai.configure(api_key=api_key)
+     model = genai.GenerativeModel("gemini-1.5-flash")
+     myfile = genai.upload_file(image_path)
+     result = model.generate_content(
+         [myfile, "\n\n", "Can you give a textual description of the image?"]
+     )
+     return result.text
+
+
+ # Now that you have text from the diary and text describing the diary writer,
+ # you can utilize the SDXL-Turbo stable diffusion model to generate
+ # images, https://huggingface.co/stabilityai/sdxl-turbo.
+ # You can try to output several images for a diary entry. Analyze how accurate the results are,
+ # and think about what could be improved.
+ def generate_image(diary_text, writer_description):
+     pipe = AutoPipelineForText2Image.from_pretrained(
+         "stabilityai/sdxl-turbo",
+         torch_dtype=torch.float16,
+         variant="fp16",
+         cache_dir="./SDXL-Turbo")
+
+     # Check for available device: CUDA, MPS, or CPU
+     if torch.cuda.is_available():
+         device = "cuda"
+         print("Using CUDA backend.")
+     elif torch.backends.mps.is_available():
+         device = "mps"
+         print("Using MPS backend.")
+     else:
+         device = "cpu"
+         print("CUDA and MPS not available. Falling back to CPU.")
+
+     # Move the model to the selected device
+     pipe = pipe.to(device)
+
+     # Generate the image with a simple prompt
+     prompt = f'Writer Description: {writer_description} \n\n Diary: {diary_text}'
+     print(prompt)
+     image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
+
+     # Save the generated image
+     image.save("generated_image.png")
+
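The comment above generate_image() suggests producing several candidate images per diary entry. A sketch of one way to do that with SDXL-Turbo, using a fixed seed per candidate so runs are reproducible; the function name, num_images parameter, and output file names are illustrative additions, not part of this commit:

import torch
from diffusers import AutoPipelineForText2Image


def generate_image_variants(diary_text, writer_description, num_images=4):
    # Same SDXL-Turbo pipeline as generate_image() in baseline_utils.py.
    # (On a CPU-only machine you would likely load with torch.float32 instead.)
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo",
        torch_dtype=torch.float16,
        variant="fp16",
        cache_dir="./SDXL-Turbo")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipe.to(device)

    prompt = f"Writer Description: {writer_description} \n\n Diary: {diary_text}"
    paths = []
    for i in range(num_images):
        # A distinct seed per candidate keeps the variants different but reproducible.
        generator = torch.Generator(device=device).manual_seed(i)
        image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0,
                     generator=generator).images[0]
        path = f"generated_image_{i}.png"
        image.save(path)
        paths.append(path)
    return paths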
Experiments/Baseline/images/test_sample.jpeg ADDED
Experiments/Baseline/images/writer.jpg ADDED
Source/requirements.txt DELETED
File without changes
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ openai
+ google-cloud-vision
+ google-auth
+ google-generativeai
+ diffusers
+ torch
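Note: GUI.py also imports streamlit and PIL (pillow), and the SDXL-Turbo pipeline pulls in transformers for its text encoders; none of these are listed above. If both the GUI and image generation are exercised, the environment would presumably need something closer to the following (the extra entries are an assumption, not part of this commit):

openai
google-cloud-vision
google-auth
google-generativeai
diffusers
transformers
torch
streamlit
pillow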