Log in, get basic data set, calculate chars to token ratio
This commit is contained in:
parent
7f6ed97c8e
commit
e3b22c17ac
87
Tuner.py
Normal file
87
Tuner.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
from huggingface_hub import HfApi
|
||||||
|
# main.py
|
||||||
|
from ModelParameter import (
|
||||||
|
MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
|
||||||
|
GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
|
||||||
|
EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
|
||||||
|
FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
|
||||||
|
USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
|
||||||
|
)
|
||||||
|
|
||||||
|
# Sanity check: echo two of the settings imported from ModelParameter
# (e.g. bigcode/starcoderbase-1b and 2048).
print(MODEL, SEQ_LENGTH, sep="\n")
|
||||||
|
|
||||||
|
from transformers import (
|
||||||
|
AutoModelForCausalLM,
|
||||||
|
AutoTokenizer,
|
||||||
|
Trainer,
|
||||||
|
TrainingArguments,
|
||||||
|
logging,
|
||||||
|
set_seed,
|
||||||
|
BitsAndBytesConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Seed the run with SEED (the same value is reused for the dataset shuffle below) so results are repeatable.
set_seed(SEED)
|
||||||
|
|
||||||
|
# Step 1: Authenticate against the Hugging Face Hub.
# The access token is a secret and must never be committed to the repository,
# so it is read from the HF_TOKEN environment variable instead of being
# hard-coded. Tokens are created at https://huggingface.co/settings/tokens.
# NOTE: any token that was previously committed in this file is exposed in the
# git history and must be revoked.
import os

from huggingface_hub import HfFolder

api_token = os.environ.get("HF_TOKEN")
if api_token:
    # Persist the token in the local Hugging Face cache for later Hub calls.
    HfFolder.save_token(api_token)
else:
    # No token supplied: continue with whatever credentials an earlier login
    # left in the local cache instead of aborting the script.
    print("HF_TOKEN is not set; relying on cached Hugging Face credentials.")
|
||||||
|
|
||||||
|
# Step 2: Query the model hub through an HfApi client.
# Passing author=... restricts the listing to the models of a single account.
user_name = "tobjend"  # Replace with your Hugging Face username
hf_api = HfApi()
models = hf_api.list_models(author=user_name)
|
||||||
|
# TODO: print the info of the first model in `models`, but only if the listing is not empty.
|
||||||
|
|
||||||
|
from datasets import load_dataset
|
||||||
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Load the corpus in streaming mode: examples are fetched progressively while
# we iterate, so a potentially very large dataset never has to be downloaded
# in full up front.
dataset = load_dataset(DATASET, data_dir="data", split="train", streaming=True)

# Hold out the first 4000 streamed examples for validation. Everything after
# them is training data, shuffled through a 5000-example buffer seeded with SEED.
valid_data = dataset.take(4000)
train_data = dataset.skip(4000).shuffle(buffer_size=5000, seed=SEED)
|
||||||
|
|
||||||
|
# At this step, the dataset still contains raw data with code of arbitrary length. For training, we need inputs
|
||||||
|
# of fixed length. Let’s create an Iterable dataset that would return constant-length chunks of tokens from
|
||||||
|
# a stream of text files.
|
||||||
|
# Load the tokenizer that belongs to MODEL.
# NOTE(review): trust_remote_code=True presumably lets code shipped in the model repo run locally — confirm MODEL is a trusted source.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
|
||||||
|
|
||||||
|
|
||||||
|
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    The character-to-token ratio can also be used as an indicator of the
    quality of text tokenization. For instance, a ratio of 1.0 would mean that
    each character is represented with a token, which is not very meaningful
    and would indicate poor tokenization. In standard English text, one token
    is typically equivalent to approximately four characters (ratio around
    4.0). We can expect a lower ratio in the code dataset, but generally
    speaking, a number between 2.0 and 3.5 can be considered good enough.

    Args:
        dataset: Iterable of examples; each example is indexed with
            ``data_column`` to obtain its text.
        tokenizer: Callable applied to the text of each example; the object it
            returns must provide a ``tokens()`` method.
        data_column: Key of the text field inside each example.
        nb_examples: Maximum number of examples, taken from the start of
            ``dataset``, to include in the estimate.

    Returns:
        Total number of characters divided by total number of tokens over the
        sampled examples.

    Raises:
        ValueError: If the sample yields no tokens at all (empty dataset,
            ``nb_examples`` of 0, or only empty texts), because the ratio is
            undefined in that case.
    """
    from itertools import islice

    total_characters, total_tokens = 0, 0
    # islice stops after nb_examples items without pulling extra examples
    # from a streaming dataset.
    for example in tqdm(islice(dataset, nb_examples), total=nb_examples):
        text = example[data_column]
        total_characters += len(text)
        total_tokens += len(tokenizer(text).tokens())

    if total_tokens == 0:
        # Fail with an explicit message instead of a bare ZeroDivisionError.
        raise ValueError(
            "Cannot compute the character-to-token ratio: "
            "no tokens were produced from the sampled examples."
        )

    return total_characters / total_tokens
|
||||||
|
|
||||||
|
# Measure the ratio on the shuffled training stream and report it with two decimals.
chars_per_token = chars_token_ratio(
    dataset=train_data,
    tokenizer=tokenizer,
    data_column=DATA_COLUMN,
)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user