Compare commits
No commits in common. "main" and "d3cb25c3731eb5b39a42e3bc63b615873402eaf5" have entirely different histories.
main
...
d3cb25c373
170
.gitignore
vendored
170
.gitignore
vendored
@ -1,170 +0,0 @@
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
#uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||
.pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
125
ModelParameter.py
Normal file
125
ModelParameter.py
Normal file
@ -0,0 +1,125 @@
|
||||
"""Hyperparameters and configuration constants for fine-tuning StarCoder with LoRA."""

# ---- Model & data ----
MODEL = "bigcode/starcoderbase-1b"  # Model checkpoint on the Hugging Face Hub.
DATASET = "smangrul/hf-stack-v1"    # Dataset on the Hugging Face Hub.
DATA_COLUMN = "content"             # Column name containing the code content.
SEQ_LENGTH = 2048                   # Maximum sequence length for input data.

# ---- Training arguments ----
MAX_STEPS = 2000       # Maximum number of training steps.
BATCH_SIZE = 16        # Number of samples per batch.
GR_ACC_STEPS = 1       # Steps to accumulate gradients before updating.
LR = 5e-4              # Learning rate for training.
LR_SCHEDULER_TYPE = "cosine"  # LR schedule type; "cosine" decays the LR along a cosine curve.
WEIGHT_DECAY = 0.01    # Regularization to prevent overfitting.
NUM_WARMUP_STEPS = 30  # Steps for learning rate warmup.
EVAL_FREQ = 100        # Frequency (in steps) of evaluation during training.
SAVE_FREQ = 100        # Frequency (in steps) of saving model checkpoints.
LOG_FREQ = 25          # Frequency (in steps) of logging training metrics.
OUTPUT_DIR = "peft-starcoder-lora-a100"  # Directory for output files.

# ---- Precision ----
# bfloat16 keeps float32-like dynamic range at 16-bit width, cutting memory and
# speeding up training on hardware that supports it (e.g. A100 GPUs, TPUs).
BF16 = True
FP16 = False  # Do not use float16 precision.

# ---- Fill-in-the-Middle (FIM) transformation arguments ----
FIM_RATE = 0.5      # Rate of Fill-in-the-Middle transformations.
FIM_SPM_RATE = 0.5  # Rate of Sentence Piece Model transformations.

# ---- LoRA ----
LORA_R = 8          # Rank of the LoRA update matrices: higher = more expressive, more trainable params.
LORA_ALPHA = 32     # Scaling factor applied to the LoRA updates.
LORA_DROPOUT = 0.0  # Dropout rate on the LoRA matrices; 0.0 disables dropout.
# Transformer modules that receive LoRA adapters:
# - c_attn: attention input projection, q_attn: query projection,
# - c_fc: feedforward layer, c_proj: output projection.
# NOTE: the original value repeated "c_proj" twice; the duplicate entry was
# redundant (module matching is effectively set-based) and has been removed.
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc"

# ---- bitsandbytes (4-bit quantization) config ----
USE_NESTED_QUANT = True  # Enable nested (double) quantization to further reduce memory footprint.
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # Dtype used for computation with 4-bit quantized weights.

SEED = 0  # Random seed for reproducibility.
|
||||
@ -1,3 +0,0 @@
|
||||
# fine_tuning
|
||||
|
||||
https://huggingface.co/learn/cookbook/fine_tuning_code_llm_on_single_gpu
|
||||
87
Tuner.py
Normal file
87
Tuner.py
Normal file
@ -0,0 +1,87 @@
|
||||
"""Fine-tuning driver: authenticates with the Hugging Face Hub, streams the
training dataset, and prepares the tokenizer. Hyperparameters live in
ModelParameter.py."""

import os

import torch
from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
)

from ModelParameter import (
    MODEL, DATASET, DATA_COLUMN, SEQ_LENGTH, MAX_STEPS, BATCH_SIZE,
    GR_ACC_STEPS, LR, LR_SCHEDULER_TYPE, WEIGHT_DECAY, NUM_WARMUP_STEPS,
    EVAL_FREQ, SAVE_FREQ, LOG_FREQ, OUTPUT_DIR, BF16, FP16, FIM_RATE,
    FIM_SPM_RATE, LORA_R, LORA_ALPHA, LORA_DROPOUT, LORA_TARGET_MODULES,
    USE_NESTED_QUANT, BNB_4BIT_COMPUTE_DTYPE, SEED
)

# Sanity-print the key configuration values.
print(MODEL)       # Output: bigcode/starcoderbase-1b
print(SEQ_LENGTH)  # Output: 2048

set_seed(SEED)

# Step 1: Hugging Face authentication.
# SECURITY: the original file hard-coded a personal API token here. A token
# committed to version control is a leaked credential and must be revoked at
# https://huggingface.co/settings/tokens. Read it from the environment instead.
api_token = os.environ.get("HF_TOKEN")
if api_token:
    HfFolder.save_token(api_token)

# Step 2: Use the authenticated client to interact with the model hub.
hf_api = HfApi()

# Example: list models uploaded by a specific user.
user_name = "tobjend"  # Replace with your Hugging Face username.
models = hf_api.list_models(author=user_name)

# Load the data in streaming mode: the dataset is likely large, and streaming
# fetches examples progressively as we iterate instead of downloading the
# whole dataset at once.
dataset = load_dataset(
    DATASET,
    data_dir="data",
    split="train",
    streaming=True,
)

# Hold out the first 4000 examples for validation; shuffle the remainder for
# training (buffered shuffle, since a streamed dataset has no global order).
valid_data = dataset.take(4000)
train_data = dataset.skip(4000)
train_data = train_data.shuffle(buffer_size=5000, seed=SEED)

# At this point the dataset still contains raw code of arbitrary length.
# Training needs fixed-length inputs, so constant-length chunks of tokens are
# produced from this tokenizer further below.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
|
||||
|
||||
|
||||
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    The character-to-token ratio also indicates tokenization quality: a ratio
    near 1.0 (one token per character) signals poor tokenization. In standard
    English text one token typically covers about four characters (ratio
    ~4.0); code tokenizes more densely, and a value between 2.0 and 3.5 is
    generally good enough.
    """
    n_chars = 0
    n_tokens = 0
    # zip against range() caps the scan at nb_examples from the (possibly
    # streamed) dataset; tqdm shows progress against that cap.
    sampled = zip(range(nb_examples), iter(dataset))
    for _, sample in tqdm(sampled, total=nb_examples):
        text = sample[data_column]
        n_chars += len(text)
        n_tokens += len(tokenizer(text).tokens())

    return n_chars / n_tokens
|
||||
|
||||
# Report the character-to-token ratio of the training split; presumably used
# later to size the constant-length token chunks — TODO confirm against the
# rest of the training pipeline (not visible here).
chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user