Skip to content

Commit 1a8c9de

Browse files
Merge pull request #359 from Tarun-Sreepada/main
apriori update
2 parents 28cd4af + c87a93d commit 1a8c9de

File tree

2 files changed

+151
-67
lines changed

2 files changed

+151
-67
lines changed

PAMI/frequentPattern/basic/Apriori.py

+149-65
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
from PAMI.frequentPattern.basic import abstract as _ab
5555
from typing import List, Dict, Tuple, Set, Union, Any, Generator
5656
from deprecated import deprecated
57+
import numpy as np
5758

5859
class Apriori(_ab._frequentPatterns):
5960
"""
@@ -221,56 +222,9 @@ def _convert(self, value: Union[int, float, str]) -> Union[int, float]:
221222
else:
222223
value = int(value)
223224
return value
224-
225-
def _candidateToFrequent(self, candidateList: List[set]) -> Dict[frozenset, int]:
226-
"""
227-
Generates frequent patterns from the candidate patterns
228-
229-
:param candidateList: Candidate pattern will be given as input
230-
231-
:type candidateList: list
232-
233-
:return: returning set of all frequent patterns
234-
235-
:rtype: dict
236-
237-
"""
238-
239-
candidateToFrequentList = {}
240-
for i in self._Database:
241-
dictionary = {frozenset(j): int(candidateToFrequentList.get(frozenset(j), 0)) + 1 for j in candidateList if
242-
j.issubset(i)}
243-
candidateToFrequentList.update(dictionary)
244-
candidateToFrequentList = {key: value for key, value in candidateToFrequentList.items() if
245-
value >= self._minSup}
246-
247-
return candidateToFrequentList
248-
249-
@staticmethod
250-
def _frequentToCandidate(frequentList: Dict[frozenset, int], length: int) -> List[set]:
251-
"""
252-
253-
Generates candidate patterns from the frequent patterns
254-
255-
:param frequentList: set of all frequent patterns to generate candidate patterns of each of size is length
256-
257-
:type frequentList: dict
258-
259-
:param length: size of each candidate patterns to be generated
260-
261-
:type length: int
262-
263-
:return: set of candidate patterns in sorted order
264-
265-
:rtype: list
266-
267-
"""
268-
269-
frequentToCandidateList = []
270-
for i in frequentList:
271-
nextList = [i | j for j in frequentList if len(i | j) == length and (i | j) not in frequentToCandidateList]
272-
frequentToCandidateList.extend(nextList)
273-
return sorted(frequentToCandidateList)
225+
226+
def _lowMemory(self) -> None:
227+
print()
274228

275229
@deprecated("It is recommended to use 'mine()' instead of 'startMine()' for mining process. Starting from January 2025, 'startMine()' will be completely terminated.")
276230
def startMine(self) -> None:
@@ -303,30 +257,130 @@ def startMine(self) -> None:
303257
self._memoryRSS = process.memory_info().rss
304258
print("Frequent patterns were generated successfully using Apriori algorithm ")
305259

260+
def bitPacker(self, data, maxIndex):
    """
    Pack a collection of integer positions into one integer bitmask.

    Each position ``p`` in ``data`` sets bit ``maxIndex - p`` of the result, so
    smaller positions end up in the more significant bits. Repeated positions
    set the same bit only once.

    :param data: positions to encode
    :type data: iterable of int
    :param maxIndex: value every position is subtracted from to obtain its bit
    :type maxIndex: int
    :return: integer whose set bits represent the given positions
    :rtype: int
    """
    mask = 0
    for position in data:
        mask = mask | (1 << (maxIndex - position))
    return mask
266+
267+
# @profile
268+
def mineLowMemory(self) -> None:
    """
    Mine frequent patterns with a bitset-based Apriori search.

    Every frequent single item is represented by one Python integer in which
    each set bit marks a transaction containing the item. The support of a
    longer candidate is the number of set bits in the bitwise AND of the
    bitsets of its items. Results are stored in ``self._finalPatterns`` with
    tuples of items as keys and supports as values; runtime and memory
    statistics are recorded on the instance.

    :return: None
    """
    self._startTime = _ab._time.time()

    self._Database = []
    self._creatingItemSets()
    self._minSup = self._convert(self._minSup)
    # Start from an empty result table so that patterns from an earlier run
    # on this instance are not mixed with the patterns of this run.
    self._finalPatterns = {}

    # Map every item (stored as a 1-tuple) to the indexes of the transactions
    # that contain it.
    items = {}
    index = 0
    for line in self._Database:
        for item in line:
            items.setdefault((item,), []).append(index)
        index += 1

    # Most frequent items first: the scan below can stop at the first item
    # that misses the threshold, because all later ones are rarer still.
    items = dict(sorted(items.items(), key=lambda x: len(x[1]), reverse=True))
    cands = []
    for key in items:
        if len(items[key]) < self._minSup:
            break
        self._finalPatterns[key] = len(items[key])
        cands.append(key)
        # Replace the index list by its bitset; only frequent items are
        # converted because only they are ever looked up again.
        items[key] = self.bitPacker(items[key], index)

    # int.bit_count exists only on Python 3.10+; fall back to counting the
    # '1' digits of the binary representation on older interpreters.
    popcount = getattr(int, "bit_count", None)
    if popcount is None:
        def popcount(value):
            return bin(value).count("1")

    while cands:
        newCands = []
        for i in range(len(cands)):
            for j in range(i + 1, len(cands)):
                # Candidates sharing a prefix are stored contiguously, so the
                # first prefix mismatch ends the joinable partners of cands[i].
                if cands[i][:-1] != cands[j][:-1]:
                    break
                newCand = cands[i] + (cands[j][-1],)
                # Support = number of transactions containing every item.
                intersection = items[(newCand[0],)]
                for k in range(1, len(newCand)):
                    intersection &= items[(newCand[k],)]
                count = popcount(intersection)
                if count >= self._minSup:
                    newCands.append(newCand)
                    self._finalPatterns[newCand] = count
        cands = newCands

    self._endTime = _ab._time.time()
    process = _ab._psutil.Process(_ab._os.getpid())
    self._memoryUSS = process.memory_full_info().uss
    self._memoryRSS = process.memory_info().rss
    print("Frequent patterns were generated successfully using Apriori algorithm ")
328+
306329
def mine(self) -> None:
307330
"""
308331
Frequent pattern mining process will start from here
309332
"""
310333
self._Database = []
311334
self._startTime = _ab._time.time()
335+
312336
self._creatingItemSets()
313-
itemsList = sorted(list(set.union(*self._Database))) # because Database is list
314-
items = [{i} for i in itemsList]
315-
itemsCount = len(items)
337+
316338
self._minSup = self._convert(self._minSup)
317-
self._finalPatterns = {}
318-
for i in range(1, itemsCount):
319-
frequentSet = self._candidateToFrequent(items)
320-
for x, y in frequentSet.items():
321-
sample = str()
322-
for k in x:
323-
sample = sample + k + "\t"
324-
self._finalPatterns[sample] = y
325-
items = self._frequentToCandidate(frequentSet, i + 1)
326-
if len(items) == 0:
327-
break # finish apriori
328-
self._endTime = _ab._time.time()
339+
340+
items = {}
341+
index = 0
342+
for line in self._Database:
343+
for item in line:
344+
if tuple([item]) in items:
345+
items[tuple([item])].append(index)
346+
else:
347+
items[tuple([item])] = [index]
348+
index += 1
349+
350+
# sort by length in descending order
351+
items = dict(sorted(items.items(), key=lambda x: len(x[1]), reverse=True))
352+
353+
cands = []
354+
fileData = {}
355+
for key in items:
356+
if len(items[key]) >= self._minSup:
357+
cands.append(tuple([key]))
358+
self._finalPatterns[tuple([key])] = len(items[key])
359+
fileData[tuple([key])] = set(items[key])
360+
else:
361+
break
362+
363+
while cands:
364+
newKeys = []
365+
for i in range(len(cands)):
366+
for j in range(i+1, len(cands)):
367+
if cands[i][:-1] == cands[j][:-1]:
368+
newCand = tuple(cands[i] + tuple([cands[j][-1]]))
369+
intersection = fileData[tuple([newCand[0]])]
370+
for k in range(1, len(newCand)):
371+
intersection = intersection.intersection(fileData[tuple([newCand[k]])])
372+
373+
# intersection = fileData[cands[i]].intersection(fileData[cands[j]])
374+
if len(intersection) >= self._minSup:
375+
# fileData[newCand] = intersection
376+
newKeys.append(newCand)
377+
self._finalPatterns[newCand] = len(intersection)
378+
del cands
379+
cands = newKeys
380+
del newKeys
381+
329382
process = _ab._psutil.Process(_ab._os.getpid())
383+
self._endTime = _ab._time.time()
330384
self._memoryUSS = float()
331385
self._memoryRSS = float()
332386
self._memoryUSS = process.memory_full_info().uss
@@ -448,3 +502,33 @@ def printResults(self) -> None:
448502
else:
449503
print("Error! The number of input parameters do not match the total number of parameters provided")
450504

505+
506+
if __name__ == "__main__":
    # Ad-hoc benchmark comparing mineLowMemory() with mine() on one dataset.
    # It is guarded so that importing this module neither starts a mining run
    # nor fails on machines where the hard-coded input file does not exist.
    minUtils = [150]
    file = "/Users/tarunsreepada/Downloads/Transactional_T10I4D100K.csv"

    for minUtil in minUtils:
        # Run both mining entry points on a fresh object each and print the
        # same four statistics for every run.
        for methodName in ("mineLowMemory", "mine"):
            obj = Apriori(file, minUtil, sep='\t')
            getattr(obj, methodName)()
            print("Total number of Frequent Patterns:", len(obj.getPatterns()))
            print("Total Memory in USS:", obj.getMemoryUSS())
            print("Total Memory in RSS", obj.getMemoryRSS())
            print("Total ExecutionTime in seconds:", obj.getRuntime())

PAMI/frequentPattern/pyspark/parallelFPGrowth.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def startMine(self):
272272
sc = _SparkContext(conf=conf)
273273

274274
rdd = sc.textFile(self._iFile, self._numPartitions)\
275-
.map(lambda x: x.rstrip().split('\t'))\
275+
.map(lambda x: x.rstrip().split(self._sep))\
276276
.persist()
277277

278278
self._lno = rdd.count()
@@ -315,7 +315,7 @@ def mine(self):
315315
sc = _SparkContext(conf=conf)
316316

317317
rdd = sc.textFile(self._iFile, self._numPartitions)\
318-
.map(lambda x: x.rstrip().split('\t'))\
318+
.map(lambda x: x.rstrip().split(self._sep))\
319319
.persist()
320320

321321
self._lno = rdd.count()

0 commit comments

Comments
 (0)