Compare commits

...

No commits in common. "main" and "d3cb25c3731eb5b39a42e3bc63b615873402eaf5" have entirely different histories.

4 changed files with 212 additions and 173 deletions

170
.gitignore vendored
View File

@@ -1,170 +0,0 @@
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

125
ModelParameter.py Normal file
View File

@@ -0,0 +1,125 @@
"""Configuration constants for fine-tuning StarCoder with QLoRA + FIM.

Grouped into: model/data selection, training arguments, Fill-in-the-Middle
(FIM) transformation rates, LoRA adapter settings, and bitsandbytes
quantization options. Imported wholesale by the training script.
"""

# --- Model and data ---
MODEL = "bigcode/starcoderbase-1b"  # Model checkpoint on the Hugging Face Hub.
DATASET = "smangrul/hf-stack-v1"    # Dataset on the Hugging Face Hub.
DATA_COLUMN = "content"             # Column name containing the code content.
SEQ_LENGTH = 2048                   # Maximum sequence length for input data.

# --- Training arguments ---
MAX_STEPS = 2000        # Maximum number of training steps.
BATCH_SIZE = 16         # Number of samples per batch.
GR_ACC_STEPS = 1        # Steps to accumulate gradients before updating.
LR = 5e-4               # Learning rate for training.
# Learning-rate schedule. "cosine" decays the LR along a cosine curve:
# large updates early, progressively smaller updates as training converges.
LR_SCHEDULER_TYPE = "cosine"
WEIGHT_DECAY = 0.01     # L2-style regularization to prevent overfitting.
NUM_WARMUP_STEPS = 30   # Steps for learning-rate warmup.
EVAL_FREQ = 100         # Frequency of evaluation during training.
SAVE_FREQ = 100         # Frequency of saving model checkpoints.
LOG_FREQ = 25           # Frequency of logging training metrics.
OUTPUT_DIR = "peft-starcoder-lora-a100"  # Directory for output files.
# bfloat16 keeps float32's exponent range at half the width, trading mantissa
# precision for speed and memory on hardware that supports it (A100 et al.).
BF16 = True
FP16 = False            # Do not use float16 precision (mutually exclusive with BF16).

# --- FIM (Fill-in-the-Middle) transformation arguments ---
FIM_RATE = 0.5      # Fraction of samples receiving a FIM transformation.
FIM_SPM_RATE = 0.5  # Of those, fraction using SPM (suffix-prefix-middle) ordering.

# --- LoRA ---
# Rank of the low-rank adaptation matrices: higher rank = more expressive
# updates but more trainable parameters; lower rank = cheaper but less capacity.
LORA_R = 8
# Scaling factor applied to the LoRA update; controls how strongly the
# low-rank delta perturbs the frozen base weights.
LORA_ALPHA = 32
# Dropout applied to the LoRA matrices during training; 0.0 disables it.
LORA_DROPOUT = 0.0
# Comma-separated module names LoRA is injected into (GPT-2-style blocks):
#   c_attn - fused QKV attention projection
#   q_attn - query projection
#   c_proj - attention/MLP output projection
#   c_fc   - feed-forward expansion layer
# NOTE: the previous value listed "c_proj" twice; the duplicate is removed.
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc"

# --- bitsandbytes config ---
# Nested (double) quantization quantizes the quantization constants themselves,
# shaving additional memory off the 4-bit base model.
USE_NESTED_QUANT = True
# Data type used for computation when applying 4-bit quantization: weights are
# stored in 4 bits but matmuls are performed in bfloat16 for numerical stability.
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"

SEED = 0  # Random seed for reproducibility.

View File

@@ -1,3 +0,0 @@
# fine_tuning
https://huggingface.co/learn/cookbook/fine_tuning_code_llm_on_single_gpu

87
Tuner.py Normal file
View File

@@ -0,0 +1,87 @@
# main.py -- fine-tuning script setup: authentication, streaming dataset
# load, and train/validation split.
import os

from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig,
)
import torch

from ModelParameter import (
    MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
    GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
    EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
    FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
    USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
)

print(MODEL)       # e.g. bigcode/starcoderbase-1b
print(SEQ_LENGTH)  # e.g. 2048

set_seed(SEED)

# Step 1: Hugging Face authentication.
# SECURITY: never hard-code an API token in source -- the token previously
# committed here is a leaked credential and must be revoked. Obtain a token
# at https://huggingface.co/settings/tokens and export it as HF_TOKEN.
api_token = os.environ.get("HF_TOKEN")
if api_token:
    HfFolder.save_token(api_token)

# Step 2: Use the authenticated client to interact with the model hub.
hf_api = HfApi()
user_name = "tobjend"  # Replace with your Hugging Face username.
# List models belonging to that user and show the first one, if any exist.
models = hf_api.list_models(author=user_name)
first_model = next(iter(models), None)
if first_model is not None:
    print(first_model)

# Begin by loading the data. As the dataset is likely to be quite large,
# enable streaming mode: the data is loaded progressively as we iterate
# instead of downloading the whole dataset at once.
dataset = load_dataset(
    DATASET,
    data_dir="data",
    split="train",
    streaming=True,
)
valid_data = dataset.take(4000)
train_data = dataset.skip(4000)
train_data = train_data.shuffle(buffer_size=5000, seed=SEED)

# At this step the dataset still contains raw code of arbitrary length. For
# training we need fixed-length inputs, so further below an iterable dataset
# yields constant-length chunks of tokens from the stream of text files.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """Estimate the average number of characters per token in the dataset.

    The character-to-token ratio also indicates tokenization quality: a ratio
    of 1.0 would mean one token per character, i.e. poor tokenization. In
    standard English text one token typically covers around four characters
    (ratio ~4.0). A code dataset can be expected to score lower, but a value
    between 2.0 and 3.5 is generally good enough.

    Args:
        dataset: Iterable of examples (dict-like rows).
        tokenizer: Callable whose result exposes a ``.tokens()`` method.
        data_column: Name of the field holding the text to measure.
        nb_examples: Maximum number of examples to sample (default 400).

    Returns:
        float: Total characters divided by total tokens over the sample.

    Raises:
        ValueError: If the sample produced no tokens (e.g. empty dataset),
            instead of an opaque ZeroDivisionError.
    """
    total_characters, total_tokens = 0, 0
    # zip(range(nb_examples), ...) caps the scan at nb_examples rows even on
    # an endless streaming dataset; tqdm shows progress against that cap.
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = example[data_column]
        total_characters += len(text)
        total_tokens += len(tokenizer(text).tokens())
    if total_tokens == 0:
        raise ValueError("No tokens produced; the dataset sample may be empty.")
    return total_characters / total_tokens
# Sample the training stream to report its character-to-token ratio.
chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
message = "The character to token ratio of the dataset is: {:.2f}".format(chars_per_token)
print(message)