From 48aa451320b69d164a67ac86f093e3e5f08a2f6d Mon Sep 17 00:00:00 2001
From: Brian Harring <ferringb@google.com>
Date: Tue, 16 Oct 2012 04:35:33 -0700
Subject: manifest recommit deduplication.

Drops the commit count from ~1070k to 637k
---
 rewrite-commit-dump.py | 69 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 9 deletions(-)

diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index b25a7b7..1df8211 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -65,7 +65,7 @@ mangler.append(functools.partial(
     mangle_portage))
 
 known_footers = ('Package-Manager', 'RepoMan-Options', 'Manifest-Sign-Key')
-fields = ('author', 'committer', 'msg', 'files', 'timestamp', 'footerless_msg')
+fields = ('author', 'msg', 'files', 'timestamp', 'footerless_msg')
 fields_map = dict((attr, idx) for idx, attr in enumerate(fields))
 fake_fields = ('footerless_msg', 'timestamp')
 file_idx = fields_map['files']
@@ -107,7 +107,7 @@ def deserialize_records(source, blob_idx):
         continue
       assert chunks[0] in ('author', 'committer', 'data')
       if chunks[0] != 'data':
-        d[chunks[0]] = intern(chunks[1].strip())
+        d[chunks[0]] = chunks[1].strip()
         continue
       # Process the commit message...
       size = int(chunks[1])
@@ -152,14 +152,18 @@ def deserialize_records(source, blob_idx):
       line = source.readline()
     d['files'] = files
     # Basic sanity check for the code above...
-    assert set(fields).issuperset(d), d
     d.setdefault('author', d.get('committer'))
     assert d['author'] is not None
+    assert d['author'] == d['committer'], d
+    d.pop('committer')
     # Skank the timestamp out...
     chunks = d['author'].rsplit(None, 1)
     assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
-    d['timestamp'] = long(chunks[0].rsplit(None, 1)[1])
+    chunks = chunks[0].rsplit(None, 1)
+    d['timestamp'] = long(chunks[1])
+    d['author'] = intern(chunks[0])
     d['footerless_msg'] = record.calculate_footerless_msg(d['msg'])
+    assert set(fields).issuperset(d), d
     yield record(*[d.get(x) for x in fields])
     # Bleh... of course namedtuple doesn't make this easy.
     line = source.readline()
@@ -176,10 +180,13 @@ def serialize_records(records, handle, target='refs/heads/master', progress=5000
     write('mark :%i\n' % idx)
     # fields = ('mark', 'author', 'committer', 'msg', 'files')
     for name, value in zip(fields, record):
-      if name in ('mark', 'author', 'committer'):
-        write("%s %s\n" % (name, value))
-      elif name in fake_fields:
+      if name in fake_fields:
         continue
+      elif name == 'mark':
+        write("%s %s\n" % (name, value))
+      elif name == 'author':
+        val = "%s %i +0000" % (value, record.timestamp)
+        write('author %s\ncommitter %s\n' % (val, val))
       elif name == 'msg':
         write("data %i\n%s" % (len(value), value))
       elif name == 'files':
@@ -211,6 +218,49 @@ def simple_dedup(records):
   l = itertools.imap(operator.itemgetter(0), dupes.itervalues())
   return itertools.imap(operator.itemgetter(1), sorted(l, key=operator.itemgetter(0)))
 
+def manifest_dedup(records, backwards=(5*60)):
+  """Fold single-file Manifest recommits into an earlier matching commit.
+
+  A record whose only file is a non-deleted */Manifest is merged (via update_files) into a
+  record from the previous `backwards` seconds that touched the same directory and has the
+  same author and footerless_msg. Searching back 5 minutes is a bit much, but sadly needed.
+  Returns an iterator over the surviving records, in their original order.
+  """
+  slots = collections.defaultdict(list)
+  for idx, record in enumerate(records):
+    if len(record.files) != 1:
+      slots[record.timestamp].append((idx, record))
+      continue
+    manifest = record.files.items()[0]
+    # if it's a deletion, we don't care...
+    if not manifest[0].endswith('/Manifest') or manifest[1][0] == 'D':
+      slots[record.timestamp].append((idx, record))
+      continue
+    manifest_dir = os.path.dirname(manifest[0])
+    update = True
+    for timestamp in xrange(record.timestamp, max(0, record.timestamp - backwards), -1):
+      potential = slots.get(timestamp)
+      if potential is None:
+        continue
+      # Don't name the loop variable idx: that would clobber this record's own index.
+      for update_pos, (target_idx, target) in enumerate(reversed(potential), 1):
+        # Checking the paths first is slower, but lets us spot another author stepping in for
+        # a directory; manifest recommit or not, we shouldn't mangle that history.
+        # NOTE(review): a target outside manifest_dir still gets update_files() -- confirm.
+        if all(manifest_dir != os.path.dirname(x) for x in target.files):
+          potential[-update_pos] = (target_idx, target.update_files(record))
+          continue
+        if target.author == record.author and target.footerless_msg == record.footerless_msg:
+          potential[-update_pos] = (target_idx, target.update_files(record))
+          update = False  # same author/msg; allow the combination.
+        # An author/msg mismatch forces an injection; break only leaves this inner loop.
+        break
+    if update:
+      slots[record.timestamp].append((idx, record))
+  # Collapse the slots; sort by idx, but strip idx on the way out.
+  l = itertools.chain.from_iterable(slots.itervalues())
+  return itertools.imap(operator.itemgetter(1), sorted(l, key=operator.itemgetter(0)))
+
 def main(argv):
   records = []
   # Be careful here to just iterate over source; doing so allows this script
@@ -224,10 +274,10 @@ def main(argv):
     if not os.path.exists(commits):
       sys.stderr.write("skipping %s; no commit data\n" % directory)
       continue
-    records.extend(
+    records.extend(manifest_dedup(
       deserialize_records(
         open(commits, 'r'),
-        deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+        deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx'))))
       )
     )
   sorter = operator.attrgetter('timestamp')
@@ -237,6 +287,7 @@ def main(argv):
   # This allows us to combine the history w/out losing the ordering per repo.
   records.sort(key=sorter)
   records[:] = simple_dedup(records)
+#  records[:] = manifest_dedup(records)
   serialize_records(records, sys.stdout)
   return 0
 
-- 
cgit v1.2.3-65-gdbad