fine_tuning/Tuner.py

88 lines
3.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from huggingface_hub import HfApi
# main.py
from ModelParameter import (
MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
)
# Echo the configured model id and sequence length, one per line, so the run
# log records which settings were picked up from ModelParameter (the original
# notes expect "bigcode/starcoderbase-1b" and 2048 -- confirm in ModelParameter).
print(MODEL, SEQ_LENGTH, sep="\n")
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
)

# Seed everything up front, before any data or model work happens.
set_seed(SEED)
# Step 1: Register your Hugging Face API token with the local credential store.
# Create a token at https://huggingface.co/settings/tokens and export it as the
# HF_TOKEN environment variable before running this script. The token is read
# from the environment so that no secret is stored in the source file.
# NOTE(security): a real token used to be hard-coded here; treat that token as
# leaked and revoke it on the settings page above.
import os

api_token = os.environ.get("HF_TOKEN")
if not api_token:
    # Fail fast with an actionable message instead of saving an empty token.
    raise RuntimeError(
        "HF_TOKEN is not set; export a Hugging Face access token "
        "(https://huggingface.co/settings/tokens) before running this script."
    )
from huggingface_hub import HfFolder

# Persist the token so the Hub client created below is authenticated.
HfFolder.save_token(api_token)
# Step 2: Create a Hub API client; it is used just below to query the model hub.
hf_api = HfApi()
# Example: list the models published under one account by passing the account
# name as the `author` filter.
user_name = "tobjend" # Replace with your Hugging Face username
# Restrict the listing to models whose author is `user_name`.
models = hf_api.list_models(author=user_name)
# TODO: print the info of the first entry in `models`, but only if the listing
# is non-empty. Nothing in this part of the script reads `models` yet.
from datasets import load_dataset
import torch
from tqdm import tqdm
# Load the training split in streaming mode: the corpus is likely to be large,
# so examples are fetched progressively while iterating instead of downloading
# the whole dataset at once.
dataset = load_dataset(DATASET, data_dir="data", split="train", streaming=True)

# The first 4000 streamed examples are held out for validation. Everything
# after them is used for training and shuffled through a 5000-example buffer
# seeded with SEED.
valid_data = dataset.take(4000)
train_data = dataset.skip(4000).shuffle(buffer_size=5000, seed=SEED)
# At this step, the dataset still contains raw data with code of arbitrary length. For training, we need inputs
# of fixed length. Let's create an iterable dataset that returns constant-length chunks of tokens from
# a stream of text files. The tokenizer for MODEL is loaded first because the chunking works on tokens.
# NOTE(review): trust_remote_code=True permits custom code shipped with the MODEL repository to run
# locally -- confirm the repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    The character-to-token ratio can also be used as an indicator of the quality of text tokenization.
    For instance, a character-to-token ratio of 1.0 would mean that each character is represented with a token,
    which is not very meaningful. This would indicate poor tokenization.
    In standard English text, one token is typically equivalent to approximately four characters, meaning the
    character-to-token ratio is around 4.0. We can expect a lower ratio in the code dataset, but generally speaking,
    a number between 2.0 and 3.5 can be considered good enough.

    Args:
        dataset: Iterable of examples; each example is indexed with ``data_column``.
        tokenizer: Callable applied to a single text; its result must expose ``tokens()``.
        data_column: Key of the text field inside each example.
        nb_examples: Maximum number of examples sampled from the start of ``dataset``.

    Returns:
        Total number of characters divided by the total number of tokens over the sampled examples.

    Raises:
        ValueError: If the sample produced no tokens (empty dataset, ``nb_examples`` of zero or less,
            or only empty texts), so no ratio can be computed.
    """
    total_characters, total_tokens = 0, 0
    # zip() with range() first stops after nb_examples items without pulling an
    # extra example from the (possibly streaming) dataset.
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = example[data_column]
        total_characters += len(text)
        total_tokens += len(tokenizer(text).tokens())
    if total_tokens == 0:
        # Without this guard the division below fails with a bare ZeroDivisionError.
        raise ValueError(
            "Cannot estimate the character-to-token ratio: no tokens were "
            f"produced from the first {nb_examples} example(s) of column {data_column!r}."
        )
    return total_characters / total_tokens
# Estimate the characters-per-token ratio on the shuffled training stream
# (chars_token_ratio samples its default number of examples) and report it
# with two decimals.
chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")