---
license: mit
datasets:
  - FinGPT/fingpt-sentiment-train
language:
  - en
metrics:
  - accuracy
library_name: transformers
pipeline_tag: text-classification
widget:
  - text: >-
      The current lay-offs are additional to the temporary lay-offs agreed in
      December 2008 and in May 2009.
  - text: >-
      Last month we added to our $GILD position and started a new one in $BWLD.
      We see slow, steady, unspectacular growth going forward near term.
---

# Model Card for financial_sentiment_analysis_gpt2_model

https://huggingface.co/rezacsedu/financial_sentiment_analysis_gpt2_model

## Model Details

### Model Description

This is a GPT-2 model fine-tuned on the https://huggingface.co/datasets/FinGPT/fingpt-sentiment-train dataset for downstream financial sentiment analysis.
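The classifier predicts one of nine sentiment classes. A minimal sketch of inspecting the label space of the published checkpoint (assuming the default `LABEL_i` names were saved; the human-readable mapping is listed under Uses):

```python
from transformers import AutoConfig

# Inspect the classification head of the published checkpoint
config = AutoConfig.from_pretrained("rezacsedu/financial_sentiment_analysis_gpt2_model")
print(config.num_labels)  # 9 sentiment classes
print(config.id2label)    # likely {0: 'LABEL_0', ...}; see the mapping under Uses
```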

### Model Sources

## Uses

The model is fine-tuned for downstream financial sentiment analysis and can be used for inference as shown below.
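For a quick test, the model can be loaded through the `text-classification` pipeline (a minimal sketch; it assumes the tokenizer files are available in the Hub repo, as the hosted inference widget suggests). The pipeline may return raw class names such as `LABEL_4`; the mapping to sentiment labels is defined in the snippet that follows.

```python
from transformers import pipeline

# Load the fine-tuned classifier directly from the Hub
classifier = pipeline(
    "text-classification",
    model="rezacsedu/financial_sentiment_analysis_gpt2_model",
)

print(classifier("Last month we added to our $GILD position and started a new one in $BWLD."))
```

For full per-class probabilities, load the model and tokenizer directly: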

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned model and tokenizer (the local directory produced by
# trainer.save_model below, or the Hub ID rezacsedu/financial_sentiment_analysis_gpt2_model)
model = AutoModelForSequenceClassification.from_pretrained("fine_tuned_finsetiment_model")
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_finsetiment_model")

# Define the label mapping as provided
label_mapping_reverse = {
    0: 'Mildly positive',
    1: 'Mildly negative',
    2: 'Moderately negative',
    3: 'Moderately positive',
    4: 'Positive',
    5: 'Negative',
    6: 'Neutral',
    7: 'Strongly negative',
    8: 'Strongly positive'
}

def model_predict(text):
  # Tokenize the input text
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

  # Get predictions from the model
  with torch.no_grad():
    logits = model(**inputs).logits

    # Convert to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Create a list of tuples with label and probability
    label_prob_pairs = [(label_mapping_reverse[label_idx], prob.item())
                        for label_idx, prob in enumerate(probabilities.squeeze())]

    # Sort the list by probability in descending order
    sorted_label_prob_pairs = sorted(label_prob_pairs, key=lambda pair: pair[1], reverse=True)

  # Return the sorted list of label-probability pairs
  return sorted_label_prob_pairs

# Example usage
text = "Intel Corporation (NASDAQ: INTC) has unveiled a remote verification platform called Project Amber"
predictions = model_predict(text)
for label, prob in predictions:
  print(f"{label}: {prob:.3f}")

## Training Details

### Training Data

```python
from datasets import load_dataset
from transformers import GPT2Tokenizer

dataset = load_dataset("FinGPT/fingpt-sentiment-train")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
   return tokenizer(examples["input"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

from datasets import DatasetDict
import random
import string

def generate_random_id():
  return ''.join(random.choices(string.ascii_lowercase + string.digits, k=10))

unique_outputs = set(dataset['train']['output'])

#label_mapping = {'mildly positive': 0, 'positive': 1, 'strong positive':2, 'moderately positive': 3, 'negative': 4, 'neutral': 5}  # Add more mappings as needed
label_mapping = {label: index for index, label in enumerate(unique_outputs)}

def transform_dataset(dataset):
  dataset = dataset.rename_column('input', 'text')
  dataset = dataset.rename_column('output', 'label_text')

  dataset = dataset.remove_columns(['instruction'])

  dataset = dataset.add_column('id', [generate_random_id() for _ in range(dataset.num_rows)])
  dataset = dataset.add_column('label', [label_mapping[label_text] for label_text in dataset['label_text']])

  return dataset

transformed_dataset = DatasetDict({'train': transform_dataset(tokenized_datasets['train'])})
transformed_dataset['train'].set_format(type=None, columns=['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'])

train_test_split = transformed_dataset['train'].train_test_split(test_size=0.3, seed=42)

tokenized_datasets['test'] = train_test_split['test']
tokenized_datasets['train'] = train_test_split['train']

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))
```

### Fine-tuning Procedure

```python
from transformers import GPT2ForSequenceClassification
from transformers import TrainingArguments, Trainer

# One output class per entry in the label mapping above
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=9)
# GPT-2 has no pad token; reuse the EOS id so the sequence-classification head
# can locate the last non-padding token in each input
model.config.pad_token_id = model.config.eos_token_id

training_args = TrainingArguments(
   output_dir="test_trainer",
   #evaluation_strategy="epoch",
   per_device_train_batch_size=1,  # Reduce batch size here
   per_device_eval_batch_size=1,    # Optionally, reduce for evaluation as well
   gradient_accumulation_steps=4
   )

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
   compute_metrics=compute_metrics,  # defined in the Evaluation section below
)

trainer.train()
trainer.evaluate()
trainer.save_model("fine_tuned_finsetiment_model")
```

#### Training Hyperparameters

- **Training regime:** [More Information Needed]

#### Speeds, Sizes, Times [optional]

## Evaluation

```python
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)

   return metric.compute(predictions=predictions, references=labels)
```

### Summary

## Citation [optional]

BibTeX:

## Model Card Contact

[email protected]