From e3b22c17ac8ed2103b26fcc2853c45a5140c25a6 Mon Sep 17 00:00:00 2001
From: "Tobias J. Endres"
Date: Sat, 15 Feb 2025 23:28:09 +0100
Subject: [PATCH] Log in, get basic data set, calculate chars to token ratio

---
 Tuner.py | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 Tuner.py

diff --git a/Tuner.py b/Tuner.py
new file mode 100644
index 0000000..f75d96f
--- /dev/null
+++ b/Tuner.py
@@ -0,0 +1,91 @@
+# Tuner.py
+import os
+
+from huggingface_hub import HfApi
+from ModelParameter import (
+    MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
+    GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
+    EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
+    FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
+    USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
+)
+
+# Now you can use these variables in your code
+print(MODEL)       # Output: bigcode/starcoderbase-1b
+print(SEQ_LENGTH)  # Output: 2048
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    logging,
+    set_seed,
+    BitsAndBytesConfig,
+)
+
+set_seed(SEED)
+
+# Step 1: Save your Hugging Face API token. Obtain one from
+# https://huggingface.co/settings/tokens and export it as HF_TOKEN.
+api_token = os.environ["HF_TOKEN"]  # read from the environment; never hard-code tokens
+from huggingface_hub import HfFolder
+HfFolder.save_token(api_token)
+
+# Step 2: Use the authenticated client to interact with the model hub
+hf_api = HfApi()
+
+# Example: list the models you have uploaded by filtering on your username
+user_name = "tobjend"  # Replace with your Hugging Face username
+
+# Use the author filter to get only models from a specific user
+models = hf_api.list_models(author=user_name)
+# Print the info of the first model, but only if one actually exists
+first_model = next(iter(models), None)
+if first_model is not None:
+    print(first_model)
+
+from datasets import load_dataset
+import torch
+from tqdm import tqdm
+
+# Begin by loading the data. As the dataset is likely to be quite large, make sure to enable
+# streaming mode. Streaming loads the data progressively as we iterate over the dataset
+# instead of downloading the whole dataset at once.
+dataset = load_dataset(
+    DATASET,
+    data_dir="data",
+    split="train",
+    streaming=True,
+)
+
+valid_data = dataset.take(4000)
+train_data = dataset.skip(4000)
+train_data = train_data.shuffle(buffer_size=5000, seed=SEED)
+
+# At this point the dataset still contains raw code samples of arbitrary length. For training,
+# we need inputs of fixed length, so we will create an iterable dataset that returns
+# constant-length chunks of tokens from the stream of text files.
+tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+
+
+def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
+    """
+    Estimate the average number of characters per token in the dataset.
+    The character-to-token ratio can also be used as an indicator of the quality of text tokenization.
+    For instance, a character-to-token ratio of 1.0 would mean that each character is represented with a token,
+    which is not very meaningful and would indicate poor tokenization.
+    In standard English text, one token typically corresponds to approximately four characters, meaning the
+    character-to-token ratio is around 4.0. We can expect a lower ratio in a code dataset, but generally speaking,
+    a number between 2.0 and 3.5 can be considered good enough.
+ """ + + total_characters, total_tokens = 0, 0 + for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples): + total_characters += len(example[data_column]) + total_tokens += len(tokenizer(example[data_column]).tokens()) + + return total_characters / total_tokens + +chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN) +print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") +