Log in, get basic data set, calculate chars to token ratio

2025-02-15 23:28:09 +01:00 · 2025-02-15 23:28:09 +01:00 · d3cb25c373
commit d3cb25c373
parent a1473582ec
1 changed files with 87 additions and 0 deletions
--- a/Tuner.py
+++ b/Tuner.py
@@ -0,0 +1,87 @@
+from huggingface_hub import HfApi
+# main.py
+from ModelParameter import (
+    MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
+    GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
+    EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
+    FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
+    USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
+)
+
+# Now you can use these variables in your code
+print(MODEL)  # Output: bigcode/starcoderbase-1b
+print(SEQ_LENGTH)  # Output: 2048
+
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    logging,
+    set_seed,
+    BitsAndBytesConfig,
+)
+
+set_seed(SEED)
+
+# Step 1: Save your Hugging Face API token (create one at https://huggingface.co/settings/tokens).
+import os
+api_token = os.environ["HF_TOKEN"]  # KeyError if unset; keeps the secret out of source control
+from huggingface_hub import HfFolder
+HfFolder.save_token(api_token)
+
+# Step 2: Use the authenticated client to interact with the model hub
+hf_api = HfApi()
+
+# Example: list the models published under a given Hugging Face account
+user_name = "tobjend"  # Replace with your Hugging Face username
+
+# Filter by author so that only this user's models are requested
+models = hf_api.list_models(author=user_name)
+# TODO: print the info of the first model, but only if the listing is non-empty
+
+from datasets import load_dataset
+import torch
+from tqdm import tqdm
+
+# Begin by loading the data. As the dataset is likely to be quite large, make sure to enable the streaming
+# mode. Streaming allows us to load the data progressively as we iterate over the dataset instead of
+# downloading the whole dataset at once.
+dataset = load_dataset(
+    DATASET,
+    data_dir="data",
+    split="train",
+    streaming=True,
+)
+
+valid_data = dataset.take(4000)  # the first 4000 streamed examples become the validation split
+train_data = dataset.skip(4000)  # everything after those 4000 examples is used for training
+train_data = train_data.shuffle(buffer_size=5000, seed=SEED)  # shuffle the training stream with a 5000-element buffer, seeded by SEED
+
+# At this step, the dataset still contains raw data with code of arbitrary length. For training, we need inputs
+# of fixed length. Let’s create an Iterable dataset that would return constant-length chunks of tokens from
+# a stream of text files.
+tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+
+
+def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
+    """
+    Estimate the average number of characters per token over up to the first `nb_examples` examples of `dataset`.
+    Each example's `data_column` text is tokenized with `tokenizer`; the result is total characters / total tokens.
+    A ratio of 1.0 would mean that each character is represented with a token, which indicates poor tokenization.
+    In standard English text, one token is typically equivalent to approximately four characters (ratio around 4.0).
+    We can expect a lower ratio in a code dataset; a number between 2.0 and 3.5 can be considered good enough.
+    Raises ZeroDivisionError with an explanatory message if no tokens were counted.
+    """
+    total_characters, total_tokens = 0, 0
+    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
+        text = example[data_column]  # look the column up once per example
+        total_characters += len(text)
+        total_tokens += len(tokenizer(text).tokens())
+    if total_tokens == 0:  # e.g. the dataset yielded no examples or nb_examples <= 0
+        raise ZeroDivisionError(f"no tokens counted in column {data_column!r}; cannot compute ratio")
+    return total_characters / total_tokens
+
+chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)  # sampled from the shuffled training stream
+print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
+