Compare commits

...

No commits in common. "main" and "d3cb25c3731eb5b39a42e3bc63b615873402eaf5" have entirely different histories.

4 changed files with 212 additions and 173 deletions

170
.gitignore vendored
View File

@@ -1,170 +0,0 @@
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

125
ModelParameter.py Normal file
View File

@@ -0,0 +1,125 @@
"""Configuration constants for fine-tuning StarCoder with QLoRA + FIM.

Grouped into: model/data selection, training arguments, Fill-in-the-Middle
(FIM) transformation rates, LoRA adapter settings, and bitsandbytes
quantization options. Imported wholesale by the training script.
"""

# --- Model and data ---
MODEL = "bigcode/starcoderbase-1b"  # Model checkpoint on the Hugging Face Hub.
DATASET = "smangrul/hf-stack-v1"    # Dataset on the Hugging Face Hub.
DATA_COLUMN = "content"             # Column name containing the code content.
SEQ_LENGTH = 2048                   # Maximum sequence length for input data.

# --- Training arguments ---
MAX_STEPS = 2000        # Maximum number of training steps.
BATCH_SIZE = 16         # Number of samples per batch.
GR_ACC_STEPS = 1        # Steps to accumulate gradients before updating.
LR = 5e-4               # Learning rate for training.
# Learning-rate schedule. "cosine" decays the LR along a cosine curve:
# large updates early, progressively smaller updates as training converges.
LR_SCHEDULER_TYPE = "cosine"
WEIGHT_DECAY = 0.01     # L2-style regularization to prevent overfitting.
NUM_WARMUP_STEPS = 30   # Steps for learning-rate warmup.
EVAL_FREQ = 100         # Frequency of evaluation during training.
SAVE_FREQ = 100         # Frequency of saving model checkpoints.
LOG_FREQ = 25           # Frequency of logging training metrics.
OUTPUT_DIR = "peft-starcoder-lora-a100"  # Directory for output files.
# bfloat16 keeps float32's exponent range at half the width, trading mantissa
# precision for speed and memory on hardware that supports it (A100 et al.).
BF16 = True
FP16 = False            # Do not use float16 precision (mutually exclusive with BF16).

# --- FIM (Fill-in-the-Middle) transformation arguments ---
FIM_RATE = 0.5      # Fraction of samples receiving a FIM transformation.
FIM_SPM_RATE = 0.5  # Of those, fraction using SPM (suffix-prefix-middle) ordering.

# --- LoRA ---
# Rank of the low-rank adaptation matrices: higher rank = more expressive
# updates but more trainable parameters; lower rank = cheaper but less capacity.
LORA_R = 8
# Scaling factor applied to the LoRA update; controls how strongly the
# low-rank delta perturbs the frozen base weights.
LORA_ALPHA = 32
# Dropout applied to the LoRA matrices during training; 0.0 disables it.
LORA_DROPOUT = 0.0
# Comma-separated module names LoRA is injected into (GPT-2-style blocks):
#   c_attn - fused QKV attention projection
#   q_attn - query projection
#   c_proj - attention/MLP output projection
#   c_fc   - feed-forward expansion layer
# NOTE: the previous value listed "c_proj" twice; the duplicate is removed.
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc"

# --- bitsandbytes config ---
# Nested (double) quantization quantizes the quantization constants themselves,
# shaving additional memory off the 4-bit base model.
USE_NESTED_QUANT = True
# Data type used for computation when applying 4-bit quantization: weights are
# stored in 4 bits but matmuls are performed in bfloat16 for numerical stability.
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"

SEED = 0  # Random seed for reproducibility.

View File

@@ -1,3 +0,0 @@
# fine_tuning
https://huggingface.co/learn/cookbook/fine_tuning_code_llm_on_single_gpu

87
Tuner.py Normal file
View File

@@ -0,0 +1,87 @@
# main.py -- fine-tuning script setup: authentication, streaming dataset
# load, and train/validation split.
import os

from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig,
)
import torch

from ModelParameter import (
    MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
    GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
    EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
    FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
    USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
)

print(MODEL)       # e.g. bigcode/starcoderbase-1b
print(SEQ_LENGTH)  # e.g. 2048

set_seed(SEED)

# Step 1: Hugging Face authentication.
# SECURITY: never hard-code an API token in source -- the token previously
# committed here is a leaked credential and must be revoked. Obtain a token
# at https://huggingface.co/settings/tokens and export it as HF_TOKEN.
api_token = os.environ.get("HF_TOKEN")
if api_token:
    HfFolder.save_token(api_token)

# Step 2: Use the authenticated client to interact with the model hub.
hf_api = HfApi()
user_name = "tobjend"  # Replace with your Hugging Face username.
# List models belonging to that user and show the first one, if any exist.
models = hf_api.list_models(author=user_name)
first_model = next(iter(models), None)
if first_model is not None:
    print(first_model)

# Begin by loading the data. As the dataset is likely to be quite large,
# enable streaming mode: the data is loaded progressively as we iterate
# instead of downloading the whole dataset at once.
dataset = load_dataset(
    DATASET,
    data_dir="data",
    split="train",
    streaming=True,
)
valid_data = dataset.take(4000)
train_data = dataset.skip(4000)
train_data = train_data.shuffle(buffer_size=5000, seed=SEED)

# At this step the dataset still contains raw code of arbitrary length. For
# training we need fixed-length inputs, so further below an iterable dataset
# yields constant-length chunks of tokens from the stream of text files.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """Estimate the average number of characters per token in the dataset.

    The character-to-token ratio also indicates tokenization quality: a ratio
    of 1.0 would mean one token per character, i.e. poor tokenization. In
    standard English text one token typically covers around four characters
    (ratio ~4.0). A code dataset can be expected to score lower, but a value
    between 2.0 and 3.5 is generally good enough.

    Args:
        dataset: Iterable of examples (dict-like rows).
        tokenizer: Callable whose result exposes a ``.tokens()`` method.
        data_column: Name of the field holding the text to measure.
        nb_examples: Maximum number of examples to sample (default 400).

    Returns:
        float: Total characters divided by total tokens over the sample.

    Raises:
        ValueError: If the sample produced no tokens (e.g. empty dataset),
            instead of an opaque ZeroDivisionError.
    """
    total_characters, total_tokens = 0, 0
    # zip(range(nb_examples), ...) caps the scan at nb_examples rows even on
    # an endless streaming dataset; tqdm shows progress against that cap.
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = example[data_column]
        total_characters += len(text)
        total_tokens += len(tokenizer(text).tokens())
    if total_tokens == 0:
        raise ValueError("No tokens produced; the dataset sample may be empty.")
    return total_characters / total_tokens
# Sample the training stream to report its character-to-token ratio.
chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
message = "The character to token ratio of the dataset is: {:.2f}".format(chars_per_token)
print(message)