In [1]:
from langchain_groq import ChatGroq

In [2]:
import os

# Read the Groq API key from the environment rather than hard-coding it in the
# notebook, so the secret never ends up in version control or shared copies.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it before starting the Jupyter kernel."
    )

# NOTE(review): confirm that this model id is still served by Groq.
llm = ChatGroq(
    temperature=0,
    groq_api_key=groq_api_key,
    model_name="llama-3.1-70b-versatile"
)
# Quick smoke test: send one prompt and print the model's reply.
response = llm.invoke("The first person to land on moon was ...")
print(response.content)

The first person to land on the moon was Neil Armstrong. He stepped onto the lunar surface on July 20, 1969, as part of the Apollo 11 mission.


In [3]:
# We need to set up a vector database, and we are going to use ChromaDB.
# There are other solutions too, but ChromaDB is open source and very lightweight.

In [4]:
# WebBaseLoader takes a URL and scrapes the text of that page.

from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://careers.myntra.com/job-detail/?id=7431200002")
scraped_docs = loader.load()
# Use the text of the last document the loader returned.
page_data = scraped_docs[-1].page_content
print(page_data)

USER_AGENT environment variable not set, consider setting it to identify your requests.



































Data Scientist

























About
Alum
Inclusion
Careers
Culture
Blog
Tech










Data Scientist
Bengaluru








Share









Apply



About Team
Myntra Data Science team delivers a large number of data science solutions for the company which are deployed at various customer touch points every quarter. The models create significant revenue and customer experience impact. The models involve real-time, near-real-time and offline solutions with varying latency requirements. The models are built using massive datasets. You will have the opportunity to be part of a rapidly growing organization and gain exposure to all the parts of a comprehensive ecommerce platform. You’ll also get to learn the intricacies of building models that serve millions of requests per second at sub second latency. 
The team takes pride in deploying solutions that not only leverage state of the art machine learning models like graph neural networks, diffusion model

In [5]:
from langchain_core.prompts import PromptTemplate
# Prompt that asks the LLM to turn the scraped careers-page text into JSON with
# the keys `role`, `experience`, `skills` and `description`.
# `{page_data}` is a template variable that is filled in when the chain is invoked.
# "(NO PREAMBLE)" tells the model not to prepend filler text such as "Here is your response".
prompt_extract = PromptTemplate.from_template(
        """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)

In [6]:
# Build a LangChain chain: the filled-in prompt is piped straight into the LLM.
chain_extract = prompt_extract | llm
extract_inputs = {'page_data': page_data}
res = chain_extract.invoke(input=extract_inputs)

# The reply is the job description rendered as JSON text.
print(res.content)

[
  {
    "role": "Data Scientist",
    "experience": "1+ years of relevant industry experience with a Bachelor’s degree or Master’s/PhD in Computer Science, Mathematics, Statistics/related fields",
    "skills": [
      "Python or one other high-level programming language",
      "Theoretical understanding of statistical models such as regression, clustering and ML algorithms such as decision trees, neural networks, etc.",
      "Machine learning frameworks like TensorFlow, PyTorch, or scikit-learn",
      "SQL and/or NoSQL databases"
    ],
    "description": "Design, develop and deploy machine learning models, algorithms and systems to solve complex business problems for Myntra Recsys, Search, Vision, SCM, Pricing, Forecasting, Trend and Virality prediction, Gen AI and other areas. Theoretical understanding and practise of machine learning and expertise in one or more of the topics, such as, NLP, Computer Vision, recommender systems and Optimisation."
  }
]


In [7]:
# The reply content is still a plain string; we want Python objects, so the next
# cell runs it through a JSON parser.
type(res.content)

str

In [8]:
from langchain_core.output_parsers import JsonOutputParser

# Parse the LLM's JSON text into Python objects and display the result.
json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)
json_res

[{'role': 'Data Scientist',
  'experience': '1+ years of relevant industry experience with a Bachelor’s degree or Master’s/PhD in Computer Science, Mathematics, Statistics/related fields',
  'skills': ['Python or one other high-level programming language',
   'Theoretical understanding of statistical models such as regression, clustering and ML algorithms such as decision trees, neural networks, etc.',
   'Machine learning frameworks like TensorFlow, PyTorch, or scikit-learn',
   'SQL and/or NoSQL databases'],
  'description': 'Design, develop and deploy machine learning models, algorithms and systems to solve complex business problems for Myntra Recsys, Search, Vision, SCM, Pricing, Forecasting, Trend and Virality prediction, Gen AI and other areas. Theoretical understanding and practise of machine learning and expertise in one or more of the topics, such as, NLP, Computer Vision, recommender systems and Optimisation.'}]

In [9]:
# Number of top-level entries in the parsed result.
len(json_res)

1

In [10]:
type(json_res)
# The parser returned a list, but later cells index the posting like a dictionary
# (e.g. job['skills']), so we need the dictionary inside it.

list

In [11]:
# The parsed result may be a list of postings. Keep only the first one so that
# later cells can treat `json_res` as a single dictionary; any further postings
# in the list are dropped.
if isinstance(json_res, list):
    if not json_res:
        # An empty list means nothing was extracted; fail with a clear message
        # instead of an opaque IndexError from json_res[0].
        raise ValueError("No job postings were extracted from the scraped page.")
    json_res = json_res[0]

In [12]:
# Display the extracted posting.
json_res

{'role': 'Data Scientist',
 'experience': '1+ years of relevant industry experience with a Bachelor’s degree or Master’s/PhD in Computer Science, Mathematics, Statistics/related fields',
 'skills': ['Python or one other high-level programming language',
  'Theoretical understanding of statistical models such as regression, clustering and ML algorithms such as decision trees, neural networks, etc.',
  'Machine learning frameworks like TensorFlow, PyTorch, or scikit-learn',
  'SQL and/or NoSQL databases'],
 'description': 'Design, develop and deploy machine learning models, algorithms and systems to solve complex business problems for Myntra Recsys, Search, Vision, SCM, Pricing, Forecasting, Trend and Virality prediction, Gen AI and other areas. Theoretical understanding and practise of machine learning and expertise in one or more of the topics, such as, NLP, Computer Vision, recommender systems and Optimisation.'}

In [13]:
# Now it's a dictionary.

In [14]:
# Whenever there is a job posting, we extract the skills from it
# and match them against the tech stacks listed in the CSV file to retrieve the matching portfolio URLs,
# which we will then use while writing the email.

import pandas as pd

df = pd.read_csv("my_portfolio.csv")
df

Unnamed: 0,Techstack,Links
0,"Machine Learning, ML, Python",https://github.com/MandarBhalerao/Gurgaon-Real...
1,"Recommendation System, Python",https://github.com/MandarBhalerao/Movie-Recomm...
2,"C++, CUDA",https://github.com/MandarBhalerao/Dilated-Conv...
3,"React, Node.js, MongoDB",https://example.com/react-portfolio
4,"Angular,.NET, SQL Server",https://example.com/angular-portfolio
5,"Vue.js, Ruby on Rails, PostgreSQL",https://example.com/vue-portfolio
6,"Java, Spring Boot, Oracle",https://example.com/java-portfolio
7,"Flutter, Firebase, GraphQL",https://example.com/flutter-portfolio
8,"WordPress, PHP, MySQL",https://example.com/wordpress-portfolio
9,"Magento, PHP, MySQL",https://example.com/magento-portfolio


In [15]:
import uuid
import chromadb

# chromadb.Client keeps the database in memory only, whereas PersistentClient
# stores it on disk (here in the local 'vectorstore' folder) so it can be
# reopened in later sessions.
client = chromadb.PersistentClient('vectorstore')
collection = client.get_or_create_collection(name="portfolio")

# Seed the collection only when it is empty, i.e. the first time it is created.
# NOTE(review): because of this guard, a store persisted by an earlier run is
# never refreshed when my_portfolio.csv changes; remove the 'vectorstore' folder
# to rebuild it from the current CSV.
if not collection.count() and not df.empty:
    # Add every portfolio row in one batched call: the tech stack is the
    # document that gets searched, and the link is carried along as metadata.
    collection.add(
        documents=df["Techstack"].tolist(),
        metadatas=[{"links": link} for link in df["Links"]],
        ids=[str(uuid.uuid4()) for _ in range(len(df))],
    )

In [16]:
# a folder named vectorstore will be created and data will be stored there

In [17]:
# Refer to the extracted posting as `job` in the cells below.
job = json_res

In [18]:
# Sanity-check query: look up the two closest portfolio entries for each skill
# and keep only their metadata (the portfolio links).

query_result = collection.query(query_texts=job['skills'], n_results=2)
links = query_result.get('metadatas', [])
links

[[{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/python-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/python-portfolio'}],
 [{'links': 'https://example.com/ml-python-portfolio'},
  {'links': 'https://example.com/ios-ar-portfolio'}],
 [{'links': 'https://example.com/magento-portfolio'},
  {'links': 'https://example.com/angular-portfolio'}]]

In [19]:
# Display the posting that will be passed to the email prompt.
job

{'role': 'Data Scientist',
 'experience': '1+ years of relevant industry experience with a Bachelor’s degree or Master’s/PhD in Computer Science, Mathematics, Statistics/related fields',
 'skills': ['Python or one other high-level programming language',
  'Theoretical understanding of statistical models such as regression, clustering and ML algorithms such as decision trees, neural networks, etc.',
  'Machine learning frameworks like TensorFlow, PyTorch, or scikit-learn',
  'SQL and/or NoSQL databases'],
 'description': 'Design, develop and deploy machine learning models, algorithms and systems to solve complex business problems for Myntra Recsys, Search, Vision, SCM, Pricing, Forecasting, Trend and Virality prediction, Gen AI and other areas. Theoretical understanding and practise of machine learning and expertise in one or more of the topics, such as, NLP, Computer Vision, recommender systems and Optimisation.'}

In [20]:
# The skills list that was used as the query texts for the portfolio lookup.
job['skills']

['Python or one other high-level programming language',
 'Theoretical understanding of statistical models such as regression, clustering and ML algorithms such as decision trees, neural networks, etc.',
 'Machine learning frameworks like TensorFlow, PyTorch, or scikit-learn',
 'SQL and/or NoSQL databases']

In [21]:
# Prompt template for writing the cold email.
# It has two template variables: `{job_description}` (the extracted posting)
# and `{link_list}` (the portfolio links retrieved from the vector store).

prompt_email = PromptTemplate.from_template(
        """
        ### JOB DESCRIPTION:
        {job_description}

        ### INSTRUCTION:
        You are Mandar Bhalerao, an MTech student at the Indian Institute of Science, Bangalore, focusing on Computer Science and Automation. Your academic journey is complemented by hands-on internships where you've applied cutting-edge machine learning and deep learning techniques to real-world problems.

        Your task is to write a cold email to the hiring manager detailing your experiences and projects that highlight your expertise in AI and machine learning. Start with introducing yourself using the above details and then discuss your role in enhancing the performance of Stable Diffusion models by using Knowledge Distillation Techniques at NeuroPixel.AI , achieving a 30 percent reduction in inference steps. Elaborate on your project, "Gurgaon Real Estate Price Prediction," where you implemented advanced machine learning models to achieve an R² score of 0.90 and developed a dual-layer recommendation system.

        Also, include your experience at Western Union, where you used Quantum Metric to improve user experience design, increasing conversion rates by 10%. Provide insights into your technical skills, particularly in Python and C++, and how these have supported your project implementations.

        Remember, you are Mandar, with a strong foundation in theoretical knowledge and practical application of machine learning, deep learning and AI technologies. Discuss the methodologies you employed, the challenges you overcame, and the real-world impact of your projects.
        
        Your job is to write a cold email to the hiring manager regarding the job mentioned above describing the capability of you 
        in fulfilling their needs.
        
        Also add the most relevant ones from the following links to showcase Mandar's work in these domains: {link_list}
        Remember you are Mandar Bhalerao, an MTech student at the Indian Institute of Science, Bangalore.
        End the email with Mandar Bhalerao, (new line) MTech in Computer Science and Automation, (new line) IISc Bangalore. 
        Do not provide a preamble.
        ### EMAIL (NO PREAMBLE):

        """
        )

# Anything inside curly brackets, such as {link_list}, is a template variable
# whose value is passed in as an argument when the prompt is invoked.

In [22]:
# Chain the email prompt into the LLM, then invoke it with the two template
# variables: the job description (as a string) and the retrieved links.

chain_email = prompt_email | llm
email_inputs = {"job_description": str(job), "link_list": links}
res = chain_email.invoke(email_inputs)
print(res.content)

Subject: Application for Data Scientist Role at Myntra

Dear Hiring Manager,

I am Mandar Bhalerao, an MTech student at the Indian Institute of Science, Bangalore, with a strong foundation in Computer Science and Automation. I am excited to apply for the Data Scientist role at Myntra, where I can leverage my expertise in machine learning and AI to drive business growth.

As a hands-on practitioner with a solid theoretical understanding of statistical models and machine learning algorithms, I am confident in my ability to design, develop, and deploy models that solve complex business problems. My experience in applying cutting-edge techniques to real-world problems has equipped me with the skills to tackle challenges in areas such as NLP, Computer Vision, recommender systems, and optimization.

One of my notable projects was at NeuroPixel.AI, where I worked on enhancing the performance of Stable Diffusion models using Knowledge Distillation Techniques. By employing this approach, I achi