forked from jedbrown/git-fat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgit-fat
executable file
·886 lines (850 loc) · 41.1 KB
/
git-fat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
#!/usr/bin/env python
# -*- mode:python -*-
from __future__ import print_function, with_statement
import sys
import hashlib
import tempfile
import os
import fnmatch
import filecmp
import subprocess
import shlex
import shutil
import itertools
import threading
import time
import collections
VERSION = "1.0"
# Refuse to run under Python 3: this script relies on Python-2 semantics
# elsewhere (old-style except clauses, 0755 octal literals, str==bytes).
# On very old interpreters sys.version_info is a plain tuple with no
# .major attribute, hence the type check before the attribute access.
if not type(sys.version_info) is tuple and sys.version_info.major > 2:
    sys.stderr.write('git-fat does not support Python-3 yet. Please use python2.\n')
    sys.exit(1)
try:
    # Probe for subprocess.check_output (present on Python >= 2.7).
    from subprocess import check_output
    del check_output
except ImportError:
    # Python 2.6 fallback: pure-python reimplementation installed onto
    # the subprocess module so the rest of the script can use it blindly.
    def backport_check_output(*popenargs, **kwargs):
        r"""Run command with arguments and return its output as a byte string.
        Backported from Python 2.7 as it's implemented as pure python on stdlib.
        >>> check_output(['/usr/bin/python', '--version'])
        Python 2.6.2
        """
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            error = subprocess.CalledProcessError(retcode, cmd)
            # CalledProcessError on 2.6 has no output kwarg; attach manually.
            error.output = output
            raise error
        return output
    subprocess.check_output = backport_check_output
# Chunk size (bytes) used by every streaming read in this script.
BLOCK_SIZE = 4096
def verbose_stderr(*args, **kwargs):
    """Verbose-mode logger: behaves exactly like print() but targets stderr."""
    return print(*args, file=sys.stderr, **kwargs)
def verbose_ignore(*args, **kwargs):
    """Quiet-mode logger: accept any arguments, emit nothing, return None."""
    return None
def mkdir_p(path):
    """Create *path* with any missing parents; no error if it already exists (mkdir -p)."""
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # Swallow only the "already exists as a directory" case.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def umask():
    """Get umask without changing it.

    os.umask both sets and returns the previous mask, so set a throwaway
    value and immediately restore what was there before.
    """
    current = os.umask(0)
    os.umask(current)
    return current
def readblocks(stream, blocksize=None):
    """Yield successive chunks read from *stream* until EOF.

    *blocksize* defaults to the module-level BLOCK_SIZE (resolved at call
    time) and may be overridden per call.  The original kept a `bytes`
    byte counter that was never read anywhere -- removed as dead code.
    """
    if blocksize is None:
        blocksize = BLOCK_SIZE
    while True:
        data = stream.read(blocksize)
        if not data:
            break
        yield data
def cat_iter(initer, outstream):
    """Write every chunk produced by iterator *initer* to *outstream*."""
    for chunk in initer:
        outstream.write(chunk)
def cat(instream, outstream):
    """Copy *instream* to *outstream* in BLOCK_SIZE chunks."""
    blocks = readblocks(instream)
    return cat_iter(blocks, outstream)
def difftreez_reader(input):
    """Incremental reader for git diff-tree -z output
    :oldmode newmode oldsha1 newsha1 modflag\0filename\0:oldmode newmode ...

    Generator yielding (newhash, modflag, path) for each changed file.
    Records arrive as NUL-separated field pairs: a metadata record
    followed by the filename; *buffer* collects one such pair at a time.
    """
    buffer = []
    partial = ''
    while True:
        newread = input.read(BLOCK_SIZE)
        if not newread:
            break
        partial += newread
        while True:
            # Split off the next NUL-terminated field; an unterminated
            # tail is carried over into the next read.
            head, sep, partial = partial.partition('\0')
            if not sep:
                partial = head
                break
            buffer.append(head)
            if len(buffer) == 2:
                oldmode, newmode, oldhash, newhash, modflag = buffer[0].split()
                path = buffer[1]
                yield (newhash, modflag, path)
                buffer = []
def gitconfig_get(name, file=None):
    """Return the value of git config key *name*, or None when unset.

    When *file* is given, look there first; if the key is missing from
    that file, recurse to fall back on the normal git config scopes.
    """
    args = ['git', 'config', '--get']
    if file is not None:
        args += ['--file', file]
    args.append(name)
    p = subprocess.Popen(args, stdout=subprocess.PIPE)
    output = p.communicate()[0].strip()
    if p.returncode and file is None:
        # Nonzero exit from `git config --get` means the key is unset.
        return None
    elif p.returncode:
        # Key absent from the requested file: retry without --file.
        return gitconfig_get(name)
    else:
        return output
def gitconfig_set(name, value, file=None):
    """Set git config key *name* to *value* (in *file* when given).

    Raises subprocess.CalledProcessError when `git config` fails.  The
    original assigned check_call's return (always None on success) to an
    unused local `p`; the assignment has been dropped.
    """
    args = ['git', 'config']
    if file is not None:
        args += ['--file', file]
    args += [name, value]
    subprocess.check_call(args)
class GitFat(object):
DecodeError = RuntimeError
ConfigError = RuntimeError
def __init__(self):
    # Pick the logger up front: chatty when GIT_FAT_VERSOSE-style env var
    # GIT_FAT_VERBOSE is set, otherwise a no-op.
    self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
    try:
        self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
    except subprocess.CalledProcessError:
        # Not inside a git work tree; nothing sensible to do.
        sys.exit(1)
    self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
    # Local fat object store lives under .git/fat/objects.
    self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
    if os.environ.get('GIT_FAT_VERSION') == '1':
        self.encode = self.encode_v1
    else:
        self.encode = self.encode_v2
    def magiclen(enc):
        # Placeholder length for encoder *enc*: all sha1 digests have equal
        # length, so encoding a dummy digest measures it.
        return len(enc(hashlib.sha1('dummy').hexdigest(), 5))
    self.magiclen = magiclen(self.encode) # Current version
    self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions
def setup(self):
    # Ensure the local fat object store directory exists (idempotent).
    mkdir_p(self.objdir)
def is_init_done(self):
    # Treat the presence of either filter config key as "initialized".
    return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge')
def assert_init_done(self):
    """Exit with an error unless `git fat init` has been run; then refresh the setup."""
    if not self.is_init_done():
        sys.stderr.write('fatal: git-fat is not yet configured in this repository.\n')
        sys.stderr.write('Run "git fat init" to configure.\n')
        sys.exit(1)
    self.fat_init_all() # Upgrade old git-fat setup to the latest one
def get_fat_config(self):
return os.path.join(self.gitroot,'.gitfat')
def get_fat_rsync_dirs(self):
    """Return (remote, share) rsync locations from the .gitfat file.

    rsync.remote is mandatory (ConfigError otherwise).  share falls back
    through: rsync.share -> rsync.local -> remote itself (when it is a
    local existing path) -> the local object store.
    """
    cfgpath = self.get_fat_config()
    remote = gitconfig_get('rsync.remote', file=cfgpath)
    if remote is None:
        raise GitFat.ConfigError('No rsync.remote in %s' % cfgpath)
    share = gitconfig_get('rsync.share', file=cfgpath)
    if share is None:
        # Legacy key name.
        share = gitconfig_get('rsync.local', file=cfgpath)
    if share is None and os.path.exists(remote):
        share = remote
    if share is None:
        share = self.objdir
    return remote, share
def get_fat_rsync_ssh(self):
    """Return (ssh_port, ssh_user, extra rsync options) from .gitfat; each may be None."""
    cfgpath = self.get_fat_config()
    ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
    ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
    options = gitconfig_get('rsync.options', file=cfgpath)
    return ssh_port, ssh_user, options
def get_rsync_command(self,src,dst,usessh=True):
cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
(ssh_port, ssh_user, options) = self.get_fat_rsync_ssh()
if usessh:
rshopts = ''
if ssh_user:
rshopts += ' -l ' + ssh_user
if ssh_port:
rshopts += ' -p ' + ssh_port
if rshopts:
cmd.append('--rsh=ssh' + rshopts)
if options:
cmd += options.split(' ')
cmd += [src + '/', dst + '/']
return cmd
def pushpull_to_rsync(self,push,cnt):
(remote, share) = self.get_fat_rsync_dirs()
if push:
src = self.objdir
dst = remote
self.verbose('git-fat pushpull_to_rsync: %d file(s) found to push to %s' % (cnt, remote))
else:
src = remote
dst = share # If share is set up, smudge filter will take care of linking self.objdir to share during merge|rebase step of 'pull', therefore always pull from remote to share here.
self.verbose('git-fat pushpull_to_rsync: %d file(s) found to pull from %s' % (cnt, remote))
return self.get_rsync_command(src, dst)
def symlink_to_share(self, digest):
    'Create self.objdir/digest (links) pointing at share/digest if the configuration of share is set up appropriately'
    (remote, share) = self.get_fat_rsync_dirs()
    if share == self.objdir or not os.path.exists(share): # Do nothing if share is not set up or points at a non-existing path.
        return
    sharefile = os.path.join(share, digest)
    objfile = os.path.join(self.objdir, digest)
    # Replace any existing entry (real file or stale link) with a fresh symlink.
    if os.path.lexists(objfile):
        os.remove(objfile)
    os.symlink(sharefile, objfile) # Note that sharefile may not exist, i.e. may be creating a broken symlink. It is OK as we may not have pulled from remote (to share) yet.
def convert_digest_to_symlink(self, files, share):
    'Replace self.objdir/digest in files with links pointing at share/digest'
    # Used after a push to share: the local copy becomes a link into the
    # shared store, reclaiming local disk space.
    for digest in files:
        fat = os.path.join(self.objdir, digest)
        sharefile = os.path.join(share, digest)
        os.remove(fat)
        os.symlink(sharefile, fat)
def revparse(self, revname):
    """Resolve *revname* to a full object id via `git rev-parse`."""
    return subprocess.check_output(['git', 'rev-parse', revname]).strip()
def encode_v1(self, digest, bytes):
'Produce legacy representation of file to be stored in repository.'
return '#$# git-fat %s\n' % (digest,)
def encode_v2(self, digest, bytes):
'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
return '#$# git-fat %s %20d\n' % (digest, bytes)
def decode(self, string, noraise=False):
cookie = '#$# git-fat '
if string.startswith(cookie):
parts = string[len(cookie):].split()
digest = parts[0]
bytes = int(parts[1]) if len(parts) > 1 else None
return digest, bytes
elif noraise:
return None, None
else:
raise GitFat.DecodeError('Could not decode %s' % (string))
def decode_stream(self, stream):
    'Return digest if git-fat cache, otherwise return iterator over entire file contents'
    # Read exactly one placeholder's worth of bytes; a real placeholder
    # decodes from that alone.
    preamble = stream.read(self.magiclen)
    try:
        return self.decode(preamble)
    except GitFat.DecodeError:
        # Not sure if this is the right behavior
        # Returns (chunk iterator that re-includes the consumed preamble, None).
        return itertools.chain([preamble], readblocks(stream)), None
def decode_file(self, fname):
    """Return (digest, bytes) when *fname* holds a placeholder.

    Returns (False, None) when the file is missing, unreadable, or the
    wrong size; (None, bytes) when it is placeholder-sized but does not
    decode to a digest.
    """
    import errno
    # Fast check - In case sparse-checkout is used, do not choke on missing files
    try:
        stat = os.lstat(fname)
    except OSError as exc:
        if exc.errno == errno.ENOENT:
            pass
            # (the pass above is vestigial; this return handles ENOENT)
            return False, None
        else:
            raise
    # Only placeholder-sized files can possibly be placeholders.
    if stat.st_size not in self.magiclens:
        return False, None
    # read file
    try:
        digest, bytes = self.decode_stream(open(fname))
    except IOError:
        return False, None
    if isinstance(digest, str):
        return digest, bytes
    else:
        return None, bytes
def decode_clean(self, body):
'''
Attempt to decode version in working tree. The tree version could be changed to have a more
useful message than the machine-readable copy that goes into the repository. If the tree
version decodes successfully, it indicates that the fat data is not currently available in
this repository.
'''
digest, bytes = self.decode(body, noraise=True)
return digest
def filter_clean(self, instream, outstreamclean, args):
    """Clean filter core: hash *instream*, cache its content in objdir, and
    write the placeholder to *outstreamclean*.

    If the input is already a placeholder ("hanging"), it is passed
    through verbatim instead of being cached.  args[0] is used in log
    messages as the filename being cleaned.
    """
    h = hashlib.new('sha1')
    bytes = 0
    # Content is spooled to a temp file inside objdir so the final rename
    # is atomic on the same filesystem.
    fd, tmpname = tempfile.mkstemp(dir=self.objdir)
    try:
        ishanging = False
        cached = False # changes to True when file is cached
        with os.fdopen(fd, 'w') as cache:
            outstream = cache
            # NOTE(review): blockiter is created but never consumed; the loop
            # below builds its own readblocks iterator -- looks vestigial.
            blockiter = readblocks(instream)
            firstblock = True
            for block in readblocks(instream):
                if firstblock:
                    if len(block) == self.magiclen and self.decode_clean(block[0:self.magiclen]):
                        ishanging = True # Working tree version is verbatim from repository (not smudged)
                        outstream = outstreamclean
                    firstblock = False
                h.update(block)
                bytes += len(block)
                outstream.write(block)
            outstream.flush()
        # Skip empty files
        if bytes != 0:
            digest = h.hexdigest()
            objfile = os.path.join(self.objdir, digest)
            if not ishanging:
                if os.path.exists(objfile):
                    self.verbose('git-fat filter-clean: cache already exists %s (referenced by %s)' % (objfile, str(args[0])))
                    os.remove(tmpname)
                else:
                    # Set permissions for the new file using the current umask
                    os.chmod(tmpname, int('444', 8) & ~umask())
                    os.rename(tmpname, objfile)
                    self.verbose('git-fat filter-clean: caching to %s' % objfile)
                cached = True
                outstreamclean.write(self.encode(digest, bytes))
    finally:
        # Only the rename/remove paths above mark the temp file as handled.
        if not cached:
            os.remove(tmpname)
def cmd_filter_clean(self, args):
    '''
    The clean filter runs when a file is added to the index. It gets the "smudged" (tree)
    version of the file on stdin and produces the "clean" (repository) version on stdout.
    '''
    # Ensure filters/hooks/object store exist before doing any work.
    self.cmd_init()
    self.filter_clean(sys.stdin, sys.stdout, args)
def cmd_filter_smudge(self, args):
    """Smudge filter: read a placeholder on stdin and emit the fat content on stdout.

    Missing objects are sought first in the share, then pulled from the
    remote; if still unavailable the placeholder itself is re-emitted so
    the worktree stays consistent.
    """
    self.cmd_init()
    filename = str(args[0])
    result, bytes = self.decode_stream(sys.stdin)
    if isinstance(result, str): # We got a digest
        objfile = os.path.join(self.objdir, result)
        if not os.access(objfile, os.R_OK):
            self.verbose('git-fat filter-smudge: fat object missing %s (required by %s) - will query share, if available, and try again' % (objfile, filename))
            self.symlink_to_share(result)
        if not os.access(objfile, os.R_OK):
            self.verbose('git-fat filter-smudge: fat object missing %s (required by %s) - will query remote, if available, and try again' % (objfile, filename))
            self.pull_from_remote(set([result]))
        try:
            cat(open(objfile), sys.stdout)
            self.verbose('git-fat filter-smudge: restoring from %s (referenced by %s)' % (objfile, filename))
        except IOError: # file not found
            self.verbose('git-fat filter-smudge: fat object missing %s (required by %s)' % (objfile, filename))
            sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file
    # We have an non empty iterable over the original input.
    elif len(next(result)) != 0:
        # NOTE(review): next(result) consumes the preamble chunk, and only the
        # remaining chunks are then written by cat_iter -- the first magiclen
        # bytes of a non-managed file appear to be dropped; confirm against
        # upstream before relying on this path.
        self.verbose('git-fat filter-smudge: not a managed file (%s)' % filename)
        cat_iter(result, sys.stdout)
def catalog_objects(self):
return set(os.listdir(self.objdir))
def referenced_objects(self, rev=None, all=False, quiet=False):
    """Return the set of fat digests referenced by *rev* (or all refs).

    Pipeline: `rev-list --objects` | keep sha1 | `cat-file --batch-check`
    (keep blobs whose size matches a placeholder length) |
    `cat-file --batch` (read candidate contents and decode them).
    """
    referenced = set()
    if all:
        rev = '--all'
    elif rev is None:
        rev = self.revparse('HEAD')
    if not quiet:
        print(' Finding all fat objects referenced by: %s' % rev)
    # Revision list gives us object names to inspect with cat-file...
    p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE, close_fds=True)
    def cut_sha1hash(input, output):
        # rev-list --objects appends path info; keep only the leading sha1.
        for line in input:
            output.write(line.split()[0] + '\n')
        output.close()
    # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
    p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
    def filter_gitfat_candidates(input, output):
        # Candidate placeholders are blobs with one of the known magic lengths.
        for line in input:
            objhash, objtype, size = line.split()
            if objtype == 'blob' and int(size) in self.magiclens:
                output.write(objhash + '\n')
        output.close()
    # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
    p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
    # Stream data: p1 | cut_thread | p2 | filter_thread | p3
    cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
    filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
    cut_thread.start()
    filter_thread.start()
    # Process metadata + content format provided by `cat-file --batch`
    while True:
        metadata_line = p3.stdout.readline()
        if not metadata_line:
            break # EOF
        objhash, objtype, size_str = metadata_line.split()
        size, bytes_read = int(size_str), 0
        # We know from filter that item is a candidate git-fat object and
        # is small enough to read into memory and process
        content = ''
        while bytes_read < size:
            data = p3.stdout.read(size - bytes_read)
            if not data:
                break # EOF
            content += data
            bytes_read += len(data)
        try:
            fathash = self.decode(content)[0]
            referenced.add(fathash)
        except GitFat.DecodeError:
            pass
        # Consume LF record delimiter in `cat-file --batch` output
        bytes_read = 0
        while bytes_read < 1:
            data = p3.stdout.read(1)
            if not data:
                break # EOF
            bytes_read += len(data)
    # Ensure everything is cleaned up
    cut_thread.join()
    filter_thread.join()
    p1.wait()
    p2.wait()
    p3.wait()
    return referenced
def orphan_files(self, patterns=None, quiet=False):
    """Generator for all orphan placeholders in the working tree.

    Yields (digest, filename) for each tracked file that still contains a
    git-fat placeholder.  *patterns* optionally restricts the
    `git ls-files` query.  The original used a mutable default argument
    (patterns=[]); None is the safe, equivalent default.
    """
    if patterns is None:
        patterns = []
    if not quiet:
        print(' Finding all orphan objects:')
    # -z gives NUL-separated names; the split leaves an empty tail entry.
    for fname in subprocess.check_output(['git', 'ls-files', '-z'] + patterns).split('\x00')[:-1]:
        digest = self.decode_file(fname)[0]
        if digest:
            yield (digest, fname)
def fat_files(self):
fatfiles = set()
for fatfile in self.catalog_objects():
if fatfile != '' and not os.path.islink(os.path.join(self.objdir, fatfile)):
fatfiles.add(fatfile)
return fatfiles
def cmd_status(self, args):
    """Report orphan (referenced but absent) and garbage (present but unreferenced) objects."""
    self.setup()
    catalog = self.catalog_objects()
    refargs = dict()
    if '--all' in args:
        refargs['all'] = True
    referenced = self.referenced_objects(**refargs)
    garbage = catalog - referenced
    # TODO: Why is the orphans computed this way as opposed to calling self.orphan_files?
    orphans = referenced - catalog
    if '--all' in args:
        for obj in referenced:
            print(obj)
    if orphans:
        print('Orphan objects:')
        for orph in orphans:
            print(' ' + orph)
    if garbage:
        print('Garbage objects:')
        for g in garbage:
            print(' ' + g)
def is_dirty(self):
    """Return True when the working tree differs from HEAD.

    `git diff-index --quiet HEAD` exits 0 when there are NO differences,
    so a zero exit status means the tree is clean.  The original compared
    `== 0`, reporting clean trees as dirty and dirty trees as clean.
    """
    return subprocess.call(['git', 'diff-index', '--quiet', 'HEAD']) != 0
def push_to_remote(self, files):
    """rsync the named objects from the local store to the remote; exit on failure."""
    if len(files) == 0:
        return
    cmd = self.pushpull_to_rsync(push=True, cnt=len(files))
    self.verbose('Executing: %s' % ' '.join(cmd))
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
    # Object names are fed NUL-separated on stdin (matches --from0 --files-from=-).
    p.communicate(input='\x00'.join(files))
    if p.returncode:
        sys.exit(p.returncode)
def push_to_share(self, files):
    """Copy the named objects into the shared store, then replace the local
    copies with symlinks into the share."""
    (remote, share) = self.get_fat_rsync_dirs()
    # Do nothing if share is not set up or points at a non-existing path.
    if share == self.objdir or not os.path.exists(share):
        return
    if len(files) == 0:
        return
    cmd = self.get_rsync_command(self.objdir, share, usessh=False) # ssh parameters do not apply to share. They are for remote only.
    self.verbose('git-fat push to share: Executing: %s' % ' '.join(cmd))
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
    p.communicate(input='\x00'.join(files))
    if p.returncode:
        sys.exit(p.returncode)
    # Reclaim local space: store entries become links into the share.
    self.convert_digest_to_symlink(files, share)
def git_remote_exists(self):
    """True when `git remote` succeeds and lists at least one remote."""
    proc = subprocess.Popen(['git', 'remote'], stdout=subprocess.PIPE)
    listing = proc.communicate()[0].strip()
    return bool(listing) and not proc.returncode
def cmd_pre_push(self, args):
    """pre-push hook entry point: push fat objects (but not git refs) before git pushes."""
    self.cmd_init()
    if self.git_remote_exists():
        self.cmd_push("pre")
def cmd_push(self, args):
    'Push all fat files that I have stored and referenced'
    self.setup()
    # Default to push only those objects referenced by current HEAD (includes history)
    pre = 'pre' in args # invoked from the pre-push hook: git itself will do the ref push
    pushall = '--all' in args
    # Only push objects we actually hold as real files (not share links).
    files = self.referenced_objects(all=pushall) & self.fat_files()
    self.push_to_remote(files)
    self.push_to_share(files)
    if not pre:
        self.git_push(all=pushall)
def git_push(self, all=None):
    """Run `git push` (with --all when requested) and relay its output.

    Raises RuntimeError on failure.  Two fixes: the original used the
    Python-2-only `except E, e` clause (the `as` form works on Python
    2.6+ too), and it raised GitFat.PushError, which is never defined on
    the class -- the raise itself would have died with AttributeError.
    RuntimeError matches DecodeError/ConfigError (both RuntimeError
    aliases).
    """
    cmd = ['git', 'push']
    if all:
        cmd.append('--all')
    print('Running ' + ' '.join(cmd) + ' ...')
    try:
        sys.stdout.write(subprocess.check_output(cmd))
        sys.stdout.flush()
    except subprocess.CalledProcessError as e:
        raise RuntimeError('Failed when pushing to remote git repo - Exit code: %d\n%s' % (e.returncode, e.output))
def checkout(self, show_orphans=False):
    'Update any stale files in the present working tree'
    self.assert_init_done()
    for digest, fname in self.orphan_files():
        objpath = os.path.join(self.objdir, digest)
        # Escalate: local store, then share link, then pull from remote.
        if not os.access(objpath, os.R_OK):
            self.symlink_to_share(digest)
        if not os.access(objpath, os.R_OK):
            self.pull_from_remote(set([digest]))
        if os.access(objpath, os.R_OK):
            print('Restoring %s -> %s' % (digest, fname))
            # The output of our smudge filter depends on the existence of
            # the file in .git/fat/objects, but git caches the file stat
            # from the previous time the file was smudged, therefore it
            # won't try to re-smudge. I don't know a git command that
            # specifically invalidates that cache, but touching the file
            # also does the trick.
            os.utime(fname, None)
            # This re-smudge is essentially a copy that restores permissions.
            # TODO: Find a way to fix the following bug - If fname is modified
            # by copying a valid another self.magiclen-byte long file, the
            # following command would replace it (fname) with the fat file
            # that the committed version of fname refers to rather than that
            # other self.magiclen-byte long file that got copied over.
            # The reason for that is obvious: checkout-index --index retrieves
            # that last committed version of fname, and the smudge naturally
            # pulls in what that committed fname references rather than ...
            subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname])
        elif show_orphans:
            print('Data unavailable: %s %s' % (digest,fname))
def pull_from_remote(self, files):
    'Since this sub is also used by cmd_filter_smudge, stdout needs to be nothing but what git expects => throw away stdout of rsync'
    if len(files) == 0:
        return
    cmd = self.pushpull_to_rsync(push=False, cnt=len(files))
    self.verbose('git-fat pull: Executing: %s' % ' '.join(cmd))
    p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # Discard rsync's stdout (captured via PIPE) so nothing leaks to git.
    # The original bound it to an unused `stdoutdata` local.
    p.communicate(input='\x00'.join(files))
    if p.returncode:
        sys.exit(p.returncode)
def cmd_post_merge(self, args):
    """post-merge hook entry point: pull fat objects for the freshly merged tree."""
    self.cmd_init()
    if self.git_remote_exists():
        self.cmd_pull("")
def cmd_pre_rebase(self, args):
    """pre-rebase hook entry point: pull fat objects before the rebase rewrites the tree."""
    self.cmd_init()
    if self.git_remote_exists():
        self.cmd_pull("")
def cmd_pull(self, args):
    'Pull anything that I have referenced, but not stored'
    self.setup()
    refargs = dict()
    if '--all' in args:
        refargs['all'] = True
    # A bare 40-character argument is treated as a revision restricting the pull.
    for arg in args:
        if arg.startswith('-') or len(arg) != 40:
            continue
        rev = self.revparse(arg)
        if rev:
            refargs['rev'] = rev
    #print('Determining fat files to pull...')
    files = self.filter_objects(refargs, self.parse_pull_patterns(args))
    self.pull_from_remote(files)
    # Materialize anything we just fetched into the working tree.
    self.checkout()
def parse_pull_patterns(self, args):
if '--' not in args:
return ['']
else:
idx = args.index('--')
patterns = args[idx+1:] #we don't care about '--'
return patterns
def filter_objects(self, refargs, patterns):
    """Digests that are referenced (per *refargs*) but missing locally,
    restricted to worktree files matching *patterns* (unless --all)."""
    files = self.referenced_objects(**refargs) - self.catalog_objects()
    if refargs.get('all'): # Currently ignores patterns; can we efficiently do both?
        return files
    # TODO: Based on how orphans are computed in self.cmd_status, isn't the following a no-op?
    # In other words, 'files & orphans_objects' is equal to 'files' because files is computed
    # above to be 'ref - catalog', and that's exactly how cmd_status computes its orphan. So,?
    orphans_matched = list(self.orphan_files(patterns))
    orphans_objects = set(map(lambda x: x[0], orphans_matched))
    return files & orphans_objects
def cmd_checkout(self, args):
    """CLI wrapper for checkout(): also report placeholders whose data is unavailable."""
    self.checkout(show_orphans=True)
def cmd_gc(self):
    """Delete local fat objects no longer referenced by HEAD history."""
    garbage = self.catalog_objects() - self.referenced_objects()
    print('Unreferenced objects to remove: %d' % len(garbage))
    for obj in garbage:
        fname = os.path.join(self.objdir, obj)
        print('%10d %s' % (os.stat(fname).st_size, obj))
        os.remove(fname)
def cmd_verify(self):
    """Print details of git-fat objects with incorrect data hash.

    Exits 1 when any object's content no longer hashes to its name.
    Fix: catalog_objects() takes no `quiet` keyword, so the original call
    catalog_objects(quiet=True) raised TypeError before verifying anything.
    """
    corrupted_objects = []
    for obj in self.catalog_objects():
        fname = os.path.join(self.objdir, obj)
        h = hashlib.new('sha1')
        for block in readblocks(open(fname)):
            h.update(block)
        data_hash = h.hexdigest()
        # An object's file name must equal the sha1 of its content.
        if obj != data_hash:
            corrupted_objects.append((obj, data_hash))
    if corrupted_objects:
        print('Corrupted objects: %d' % len(corrupted_objects))
        for obj, data_hash in corrupted_objects:
            print('%s data hash is %s' % (obj, data_hash))
        sys.exit(1)
def fat_init_one(self, var, value):
    """Ensure git config *var* equals *value*; return True when a write occurred.

    The original guard `value_cur is None or value_cur != value` is
    redundant: None != value already covers the unset case, so a plain
    inequality test is equivalent.
    """
    if gitconfig_get(var) != value:
        gitconfig_set(var, value)
        return True
    return False
def fat_init_all(self):
ret = False
ret = self.fat_init_one('filter.fat.clean', 'git-fat filter-clean %f') or ret
ret = self.fat_init_one('filter.fat.smudge', 'git-fat filter-smudge %f') or ret
ret = self.fat_init_one('filter.fat.required', 'true') or ret
post_merge = os.path.join(self.gitdir, 'hooks', 'post-merge')
if not os.path.isfile(post_merge):
with open(post_merge, "w") as f:
lines = ["#!/bin/sh -ex\n", "git fat post-merge \"$@\"\n"]
f.writelines(lines)
os.chmod(post_merge, 0755)
ret = True
pre_rebase = os.path.join(self.gitdir, 'hooks', 'pre-rebase')
if not os.path.isfile(pre_rebase):
with open(pre_rebase, "w") as f:
lines = ["#!/bin/sh -ex\n", "git fat pre-rebase \"$@\"\n"]
f.writelines(lines)
os.chmod(pre_rebase, 0755)
ret = True
pre_push = os.path.join(self.gitdir, 'hooks', 'pre-push')
if not os.path.isfile(pre_push):
with open(pre_push, "w") as f:
lines = ["#!/bin/sh -ex\n", "git fat pre-push \"$@\"\n"]
f.writelines(lines)
os.chmod(pre_push, 0755)
ret = True
return ret
def cmd_init(self, quiet=False):
    """Create the object store and (re)install filters/hooks; announce when anything changed."""
    self.setup()
    if self.fat_init_all() is True:
        #self.cmd_post_merge("")
        if quiet is False:
            print('Initialized git fat')
def gen_large_blobs(self, revs, threshsize):
    """Yield (objhash, size) for every blob in history larger than *threshsize*.

    Note: *revs* is currently unused -- the rev-list below always walks
    --all.  Fix: numlarge was initialized to 1, overstating the large-blob
    count in the verbose summary by one.
    """
    time0 = time.time()
    def hash_only(input, output):
        """The output of git rev-list --objects shows extra info for blobs, subdirectory trees, and tags.
        This truncates to one hash per line.
        """
        for line in input:
            output.write(line[:40] + '\n')
        output.close()
    revlist = subprocess.Popen(['git', 'rev-list', '--all', '--objects'], stdout=subprocess.PIPE, bufsize=-1, close_fds=True)
    objcheck = subprocess.Popen(['git', 'cat-file', '--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=-1, close_fds=True)
    hashonly = threading.Thread(target=hash_only, args=(revlist.stdout, objcheck.stdin))
    hashonly.start()
    numblobs = 0
    numlarge = 0
    # Stream batch-check output: "<hash> <type> <size>" per object.
    for line in objcheck.stdout:
        objhash, blob, size = line.split()
        if blob != 'blob':
            continue
        size = int(size)
        numblobs += 1
        if size > threshsize:
            numlarge += 1
            yield objhash, size
    revlist.wait()
    objcheck.wait()
    hashonly.join()
    time1 = time.time()
    self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
def cmd_find(self, args):
    """List every path that ever referenced a blob larger than args[0]
    bytes, formatted as .gitattributes lines.

    Fix: `sorted(..., key=lambda p,s: max(s))` declared a two-parameter
    lambda, but sorted() passes each (path, sizes) pair as a single
    argument, so every invocation raised TypeError (Python 2 implicit
    tuple unpacking requires the parenthesized `lambda (p,s):` form).
    The pair is now unpacked inside the lambda.  Also uses the idiomatic
    defaultdict(set) instead of defaultdict(lambda: set()).
    """
    maxsize = int(args[0])
    blobsizes = dict(self.gen_large_blobs('--all', maxsize))
    time0 = time.time()
    # Find all names assumed by large blobs (those in blobsizes)
    pathsizes = collections.defaultdict(set)
    revlist = subprocess.Popen(['git', 'rev-list', '--all'], stdout=subprocess.PIPE, bufsize=-1, close_fds=True)
    difftree = subprocess.Popen(['git', 'diff-tree', '--root', '--no-renames', '--no-commit-id', '--diff-filter=AMCR', '-r', '--stdin', '-z'],
        stdin=revlist.stdout, stdout=subprocess.PIPE, close_fds=True)
    for newblob, modflag, path in difftreez_reader(difftree.stdout):
        bsize = blobsizes.get(newblob)
        if bsize: # We care about this blob
            pathsizes[path].add(bsize)
    time1 = time.time()
    self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
    maxlen = max(map(len, pathsizes)) if pathsizes else 0
    # Sort by the largest size ever seen at each path, descending.
    for path, sizes in sorted(pathsizes.items(), key=lambda item: max(item[1]), reverse=True):
        print('%-*s filter=fat -text # %10d %d' % (maxlen, path, max(sizes), len(sizes)))
    revlist.wait()
    difftree.wait()
def cmd_index_filter(self, args):
    """`git filter-branch --index-filter` helper: rewrite index entries so
    files matching the patterns listed in the file args[0] become git-fat
    placeholders.  With --manage-gitattributes, also rewrites
    .gitattributes to carry the corresponding filter=fat entries.
    Cleaned blob hashes are memoized under .git/fat/index-filter/.
    """
    manage_gitattributes = '--manage-gitattributes' in args
    filelist = set(f.strip() for f in open(args[0]).readlines())
    lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE, close_fds=True)
    updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE, close_fds=True)
    for line in lsfiles.stdout:
        # ls-files -s lines look like: "<mode> <blobhash> <stage>\t<filename>"
        mode, sep, tail = line.partition(' ')
        blobhash, sep, tail = tail.partition(' ')
        stageno, sep, tail = tail.partition('\t')
        filename = tail.strip()
        infilelist = False
        for pattern in filelist:
            if fnmatch.fnmatch(filename, pattern):
                infilelist = True
                break
        if not infilelist:
            continue
        if mode == "120000":
            # skip symbolic links
            continue
        # This file will contain the hash of the cleaned object
        hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
        try:
            cleanedobj = open(hashfile).read().rstrip()
        except IOError:
            # Not memoized yet: stream the blob through filter_clean into
            # `git hash-object -w` to create and record the cleaned blob.
            catfile = subprocess.Popen(['git', 'cat-file', 'blob', blobhash], stdout=subprocess.PIPE, close_fds=True)
            hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
            def dofilter():
                # NOTE(review): filter_clean's third parameter is indexed as
                # args[0] in its log message; passing the bare filename string
                # means it logs only the first character -- confirm intended.
                self.filter_clean(catfile.stdout, hashobject.stdin, filename)
                hashobject.stdin.close()
            filterclean = threading.Thread(target=dofilter)
            filterclean.start()
            cleanedobj = hashobject.stdout.read().rstrip()
            catfile.wait()
            hashobject.wait()
            filterclean.join()
            mkdir_p(os.path.dirname(hashfile))
            open(hashfile, 'w').write(cleanedobj + '\n')
        updateindex.stdin.write('%s %s %s\t%s\n' % (mode, cleanedobj, stageno, filename))
    if manage_gitattributes:
        try:
            # Reuse the existing .gitattributes entry's mode/stage when present.
            mode, blobsha1, stageno, filename = subprocess.check_output(['git', 'ls-files', '-s', '.gitattributes']).split()
            gitattributes_lines = subprocess.check_output(['git', 'cat-file', 'blob', blobsha1]).splitlines()
        except ValueError: # Nothing to unpack, thus no file
            mode, stageno = '100644', '0'
            gitattributes_lines = []
        gitattributes_extra = ['%s filter=fat -text' % line.split()[0] for line in filelist]
        hashobject = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout, stderr = hashobject.communicate('\n'.join(gitattributes_lines + gitattributes_extra) + '\n')
        updateindex.stdin.write('%s %s %s\t%s\n' % (mode, stdout.strip(), stageno, '.gitattributes'))
    updateindex.stdin.close()
    lsfiles.wait()
    updateindex.wait()
def cmd_help(self):
    """Print a user-facing overview of git-fat.

    Covers the directories git-fat uses (objdir/share/remote), the
    terminology (reference/catalog/orphan/garbage objects), the clean and
    smudge filters, the extra subcommands, and which git operations invoke
    git-fat. Output goes to stdout; never raises (a missing rsync config
    falls back to empty remote and objdir as share).
    """
    objdir = os.path.join(self.gitroot, self.objdir)
    # Directories
    print('Git-fat version: '+VERSION)
    print('Directories used by git-fat:')
    print('- objdir : Contains fat objects(files and/or shared links). Shared links will only exist if \'share\' is configured.')
    print('  (' + objdir + ')')
    try:
        (remote, share) = self.get_fat_rsync_dirs()
    except GitFat.ConfigError:
        # No rsync configuration: show an empty remote and fall back to objdir as share.
        (remote, share) = ('', objdir)
    print('- share  : Directory containing pushed out fat files.')
    print('  To increase performance, you are recommended to set this spot to be on a local NAS on your site.')
    print('  This directory is shared across all your repos/wcps as well as by your peers if they are using it.')
    print('  Setting up this directory offers disk space saving as well as allows fine grained push operation => faster push performance.')
    print('  If this configuration option is not set up, its value defaults to remote if it is a directory or \'objdir\'.')
    print('  (' + share + ')')
    print('- remote : Rsync destination containing pushed out fat files.')
    print('  This rsync destination is where everyone who uses this repo pushes their fat files onto.')
    print('  (' + remote + ')')
    print('share and remote are configured via ' + self.get_fat_config())
    print()
    # Definitions
    print('Definitions used by git-fat:')
    print('- reference objects : List of all fat objects referenced by your working copy. These named files are expected to exist in \'objdir\'.')
    print('- catalog objects   : List of all fat objects in \'objdir\'')
    print('- orphan objects    : reference - catalog (subtraction)')
    print('- garbage objects   : catalog - reference (subtraction)')
    print()
    # Operation
    print('Two primary functions of git-fat are clean and smudge filters that git invokes as necessary:')
    print('- filter-clean  : (large) file content (input) => translated (small) reference file (output)')
    print('-                 Creates the fat object in \'objdir/...\' using the (large) file content. Its name is based on its SHA1.')
    print('- filter-smudge : (small) reference file (stdin) => recovered (large) file content (stdout)')
    print('                  Creates a shared link: \'objdir/...\' -> \'share/...\' for the (large) file (name is based on its SHA1). Bypassed if \'objdir/...\' already exists.')
    print('                  If \'objdir/...\' is broken, it brings in the (large) file from \'remote\' to \'share\' ==> recovers the file.')
    print('')
    print('Additional useful functions offered by git-fat are:')
    print('- git fat status    : Prints orphan and garbage objects')
    print('- git fat checkout  : Converts all orphan objects into non-orphan state, while automatically executing \'pull\'-like functionality for the specific orphan file.')
    print('- git fat gc        : Deletes all garbage objects')
    print('- git fat verify    : Report corrupt fat objects in the catalog')
    print('- More info?        : Define export var GIT_FAT_VERBOSE and continue using git-fat.')
    print('')
    print('Typical git operations, when is git-fat involved and what it does when it is invoked:')
    print('- git clone ...        : See git checkout.')
    print('- git fetch            : git-fat is not involved.')
    print('- git pull             : Runs git fat pull via post-merge or pre-rebase githook')
    print('                         Brings in data for orphan objects, computed per HEAD (including history) of your working copy, from \'remote\' to \'share\'.')
    print('                         Creates a sym link: \'objdir/...\' -> \'share/...\' for each orphan object that HEAD points at (no history) ==> No longer orphan.')
    print('                         Lets git invoke git-fat\'s filter-smudge function')
    print('- git fat pull --all   : Same as git fat pull except that the orphan objects are computed across all git objects,')
    print('                         not just per what HEAD (including history) of your working copy.')
    print('- git push             : Runs git fat push via pre-push githook')
    print('                         reference & fat files (not sym links), where & is the intersection operation, is pushed out to:')
    print('                         \'remote\'. Diff the same file set between \'objdir\' and \'remote\'. Abort if mismatches.')
    print('                         \'share\'. Diff the same file set between \'objdir\' and \'share\'. Abort if mismatches.')
    print('                         Replaces each such file in \'objdir\' with a sym link, pointing at \'share/...\'.')
    print('- git fat push         : Runs git fat push directly - for backwards compatibility with legacy,')
    print('                         Runs git push')
    print('- git fat push --all   : Same steps as git fat push except that reference is computed across all git objects,')
    print('                         not just what your HEAD (including history) is pointing at.')
    print('')
    print('- git checkout ...     : git invokes git-fat filter-smudge for each file configured in .gitattributes and post-merge githook.')
    print('- git add <path/file>  : git invokes git-fat filter-clean for each file configured in .gitattributes.')
    print('- git commit -a [...]  : See git add.')
    print('- git merge ...        : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.')
    print('- git rebase ...       : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and pre-rebase githook.')
    print('- git cherry-pick ...  : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.')
    print('- git revert ...       : git invokes git-fat filter-clean and filter-smudge for each file configured in .gitattributes and post-merge githook.')
if __name__ == '__main__':
    # Script entry point: dispatch the first CLI argument to the matching
    # GitFat command, passing any remaining arguments through.
    fat = GitFat()
    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
    # Make sure init stays quiet unless explicitly called
    if cmd != 'init':
        fat.cmd_init(True)
    # Commands that consume the remaining command-line arguments.
    commands_with_args = {
        'filter-clean': fat.cmd_filter_clean,
        'filter-smudge': fat.cmd_filter_smudge,
        'pre-push': fat.cmd_pre_push,
        'pre-rebase': fat.cmd_pre_rebase,
        'post-merge': fat.cmd_post_merge,
        'status': fat.cmd_status,
        'push': fat.cmd_push,
        'pull': fat.cmd_pull,
        'checkout': fat.cmd_checkout,
        'find': fat.cmd_find,
        'index-filter': fat.cmd_index_filter,
    }
    # Commands that take no arguments.
    commands_no_args = {
        'init': fat.cmd_init,
        'gc': fat.cmd_gc,
        'verify': fat.cmd_verify,
        'help': fat.cmd_help,
    }
    if cmd in commands_with_args:
        commands_with_args[cmd](sys.argv[2:])
    elif cmd in commands_no_args:
        commands_no_args[cmd]()
    else:
        print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter|help]', file=sys.stderr)