# continue_training.py
!pip install gensim datasets  # notebook-style install; use plain `pip install gensim datasets` outside Jupyter/Colab
import logging
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from datasets import load_dataset
# Setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Load the dataset (returns a DatasetDict with 'train', 'validation', and 'test' splits)
dataset = load_dataset("anjandash/java-8m-methods-v2")
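# The iterator below assumes each record exposes a 'text' field (the method
# source) and an 'id' field; an illustrative one-off check of the schema:
print(dataset['train'][0].keys())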
# Custom iterator that streams the dataset in chunks and yields TaggedDocuments
class DatasetIterator:
    def __init__(self, dataset_split, chunk_size=100000):
        self.dataset_split = dataset_split
        self.chunk_size = chunk_size
        self.total_size = len(dataset_split)
        self.num_chunks = -(-self.total_size // chunk_size)  # ceiling division to get total number of chunks

    def __iter__(self):
        start_time = time.time()
        for chunk_index in range(self.num_chunks):
            start_idx = chunk_index * self.chunk_size
            end_idx = min(start_idx + self.chunk_size, self.total_size)
            chunk = self.dataset_split.select(range(start_idx, end_idx))
            for example in chunk:
                words = example['text'].split()
                yield TaggedDocument(words=words, tags=[example['id']])
            # Report progress and a rough ETA after each chunk
            elapsed_time = time.time() - start_time
            chunks_processed = chunk_index + 1
            if chunks_processed < self.num_chunks:
                estimated_total_time = (elapsed_time / chunks_processed) * self.num_chunks
                estimated_remaining_time = estimated_total_time - elapsed_time
                hours, remainder = divmod(estimated_remaining_time, 3600)
                minutes, seconds = divmod(remainder, 60)
                print(f"{chunks_processed}/{self.num_chunks} chunks completed, "
                      f"estimated remaining time: {int(hours)} hours, {int(minutes)} minutes, {int(seconds)} seconds")
            else:
                print(f"All {self.num_chunks} chunks processed in {elapsed_time:.2f} seconds.")
# Initialize the Doc2Vec model
model = Doc2Vec(vector_size=200, window=10, min_count=5, workers=4, epochs=4)

# Build the vocabulary from a 500k-method subset (adjust based on available memory)
vocab_iterator = DatasetIterator(dataset['train'].select(range(500000)), chunk_size=100000)
model.build_vocab(vocab_iterator)

# Train over the full training split. total_examples must match the number of
# documents the iterator actually yields; do not inflate it with the validation
# or test splits, which are not part of this corpus.
train_iterator = DatasetIterator(dataset['train'], chunk_size=100000)
model.train(train_iterator, total_examples=len(dataset['train']), epochs=model.epochs,
            start_alpha=model.alpha, end_alpha=model.min_alpha)
'''
# Alternative configuration with more epochs and a smaller vocabulary subset:
model = Doc2Vec(vector_size=200, window=10, min_count=5, workers=4, epochs=20)

# Prepare the iterator for vocabulary building (adjust the subset size based on available memory)
vocab_iterator = DatasetIterator(dataset['train'].select(range(50000)), chunk_size=100000)
model.build_vocab(vocab_iterator)

# Train the model incrementally, displaying progress
train_iterator = DatasetIterator(dataset['train'], chunk_size=100000)
model.train(train_iterator, total_examples=len(dataset['train']), epochs=model.epochs,
            start_alpha=model.alpha, end_alpha=model.min_alpha)
'''
# Save the model
model.save("java_8m_methods_doc2vec.model")
print("Model saved successfully.")
# Evaluation and testing should be done separately to ensure the model's performance
# can be accurately assessed without using the test data during training.
# Here, you would load your test data, preprocess it if necessary,
# and then use the model.infer_vector method to generate vectors for the test documents.
# These vectors can then be used for evaluation purposes, such as similarity comparisons
# or as input features for downstream machine learning tasks.
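# A minimal sketch of that evaluation flow, assuming the test split shares the
# 'text' field used above; the variable names here are illustrative, not part
# of the original script.
test_example = dataset['test'][0]
test_vector = model.infer_vector(test_example['text'].split())
# Compare against trained document vectors, e.g. nearest neighbours by cosine similarity
for tag, similarity in model.dv.most_similar([test_vector], topn=5):
    print(f"doc {tag}: similarity {similarity:.3f}")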
##################################################################################################################################################
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import logging
# Setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Assuming your custom dataset is loaded into a DataFrame named `custom_df` with a column named 'code'
# custom_df = pd.read_csv("path/to/your/dataset.csv") # Load your dataset here
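# Hypothetical stand-in so this section runs end to end; replace it with the
# read_csv call above for real use. The two method bodies are toy examples.
custom_df = pd.DataFrame({
    'code': [
        'public int add(int a, int b) { return a + b; }',
        'public String greet(String name) { return "Hello, " + name; }',
    ]
})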
# Iterator that yields one TaggedDocument per DataFrame row
class CustomDatasetIterator:
    def __init__(self, dataframe, column_name='code'):
        self.dataframe = dataframe
        self.column_name = column_name

    def __iter__(self):
        for index, row in self.dataframe.iterrows():
            words = row[self.column_name].split()  # whitespace-tokenize the code
            tags = [index]  # use the row index as a unique document tag
            yield TaggedDocument(words=words, tags=tags)
# Load your pre-trained model
model_path = "java_8m_methods_doc2vec.model" # Update this path
model = Doc2Vec.load(model_path)
# Prepare the custom dataset for training
custom_dataset_iterator = CustomDatasetIterator(custom_df, 'code')

# Register the new documents and words with the model before continuing training;
# without update=True, tags and words unseen during the original build_vocab are
# not trainable. Note that vocabulary expansion for Doc2Vec is only partially
# supported by gensim, so verify the results carefully.
model.build_vocab(custom_dataset_iterator, update=True)
# Continue training the model with the custom dataset. total_examples should be
# the number of documents in this pass, not a cumulative count across corpora.
model.train(custom_dataset_iterator, total_examples=len(custom_df), epochs=model.epochs)
# Save the updated model
model.save("updated_java_8m_methods_doc2vec.model")
print("Model updated and saved successfully.")