| /* |
| * Copyright (C) 2012 the libgit2 contributors |
| * |
| * This file is part of libgit2, distributed under the GNU GPL v2 with |
| * a Linking Exception. For full terms see the included COPYING file. |
| */ |
| #include "common.h" |
| #include "diff.h" |
| #include "git2/config.h" |
| |
| static git_diff_delta *diff_delta__dup( |
| const git_diff_delta *d, git_pool *pool) |
| { |
| git_diff_delta *delta = git__malloc(sizeof(git_diff_delta)); |
| if (!delta) |
| return NULL; |
| |
| memcpy(delta, d, sizeof(git_diff_delta)); |
| |
| delta->old_file.path = git_pool_strdup(pool, d->old_file.path); |
| if (delta->old_file.path == NULL) |
| goto fail; |
| |
| if (d->new_file.path != d->old_file.path) { |
| delta->new_file.path = git_pool_strdup(pool, d->new_file.path); |
| if (delta->new_file.path == NULL) |
| goto fail; |
| } else { |
| delta->new_file.path = delta->old_file.path; |
| } |
| |
| return delta; |
| |
| fail: |
| git__free(delta); |
| return NULL; |
| } |
| |
| static git_diff_delta *diff_delta__merge_like_cgit( |
| const git_diff_delta *a, const git_diff_delta *b, git_pool *pool) |
| { |
| git_diff_delta *dup; |
| |
| /* Emulate C git for merging two diffs (a la 'git diff <sha>'). |
| * |
| * When C git does a diff between the work dir and a tree, it actually |
| * diffs with the index but uses the workdir contents. This emulates |
| * those choices so we can emulate the type of diff. |
| * |
| * We have three file descriptions here, let's call them: |
| * f1 = a->old_file |
| * f2 = a->new_file AND b->old_file |
| * f3 = b->new_file |
| */ |
| |
| /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */ |
| if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED) |
| return diff_delta__dup(a, pool); |
| |
| /* otherwise, base this diff on the 'b' diff */ |
| if ((dup = diff_delta__dup(b, pool)) == NULL) |
| return NULL; |
| |
| /* If 'a' status is uninteresting, then we're done */ |
| if (a->status == GIT_DELTA_UNMODIFIED) |
| return dup; |
| |
| assert(a->status != GIT_DELTA_UNMODIFIED); |
| assert(b->status != GIT_DELTA_UNMODIFIED); |
| |
| /* A cgit exception is that the diff of a file that is only in the |
| * index (i.e. not in HEAD nor workdir) is given as empty. |
| */ |
| if (dup->status == GIT_DELTA_DELETED) { |
| if (a->status == GIT_DELTA_ADDED) |
| dup->status = GIT_DELTA_UNMODIFIED; |
| /* else don't overwrite DELETE status */ |
| } else { |
| dup->status = a->status; |
| } |
| |
| git_oid_cpy(&dup->old_file.oid, &a->old_file.oid); |
| dup->old_file.mode = a->old_file.mode; |
| dup->old_file.size = a->old_file.size; |
| dup->old_file.flags = a->old_file.flags; |
| |
| return dup; |
| } |
| |
| int git_diff_merge( |
| git_diff_list *onto, |
| const git_diff_list *from) |
| { |
| int error = 0; |
| git_pool onto_pool; |
| git_vector onto_new; |
| git_diff_delta *delta; |
| bool ignore_case = false; |
| unsigned int i, j; |
| |
| assert(onto && from); |
| |
| if (!from->deltas.length) |
| return 0; |
| |
| if (git_vector_init( |
| &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 || |
| git_pool_init(&onto_pool, 1, 0) < 0) |
| return -1; |
| |
| if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 || |
| (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0) |
| { |
| ignore_case = true; |
| |
| /* This function currently only supports merging diff lists that |
| * are sorted identically. */ |
| assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 && |
| (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0); |
| } |
| |
| for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) { |
| git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i); |
| const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j); |
| int cmp = !f ? -1 : !o ? 1 : STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path); |
| |
| if (cmp < 0) { |
| delta = diff_delta__dup(o, &onto_pool); |
| i++; |
| } else if (cmp > 0) { |
| delta = diff_delta__dup(f, &onto_pool); |
| j++; |
| } else { |
| delta = diff_delta__merge_like_cgit(o, f, &onto_pool); |
| i++; |
| j++; |
| } |
| |
| /* the ignore rules for the target may not match the source |
| * or the result of a merged delta could be skippable... |
| */ |
| if (git_diff_delta__should_skip(&onto->opts, delta)) { |
| git__free(delta); |
| continue; |
| } |
| |
| if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0) |
| break; |
| } |
| |
| if (!error) { |
| git_vector_swap(&onto->deltas, &onto_new); |
| git_pool_swap(&onto->pool, &onto_pool); |
| onto->new_src = from->new_src; |
| |
| /* prefix strings also come from old pool, so recreate those.*/ |
| onto->opts.old_prefix = |
| git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix); |
| onto->opts.new_prefix = |
| git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix); |
| } |
| |
| git_vector_foreach(&onto_new, i, delta) |
| git__free(delta); |
| git_vector_free(&onto_new); |
| git_pool_clear(&onto_pool); |
| |
| return error; |
| } |
| |
| #define DEFAULT_THRESHOLD 50 |
| #define DEFAULT_BREAK_REWRITE_THRESHOLD 60 |
| #define DEFAULT_TARGET_LIMIT 200 |
| |
| static int normalize_find_opts( |
| git_diff_list *diff, |
| git_diff_find_options *opts, |
| git_diff_find_options *given) |
| { |
| git_config *cfg = NULL; |
| const char *val; |
| |
| if (diff->repo != NULL && |
| git_repository_config__weakptr(&cfg, diff->repo) < 0) |
| return -1; |
| |
| if (given != NULL) |
| memcpy(opts, given, sizeof(*opts)); |
| else { |
| memset(opts, 0, sizeof(*opts)); |
| |
| opts->flags = GIT_DIFF_FIND_RENAMES; |
| |
| if (git_config_get_string(&val, cfg, "diff.renames") < 0) |
| giterr_clear(); |
| else if (val && |
| (!strcasecmp(val, "copies") || !strcasecmp(val, "copy"))) |
| opts->flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES; |
| } |
| |
| /* some flags imply others */ |
| |
| if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES) |
| opts->flags |= GIT_DIFF_FIND_RENAMES; |
| |
| if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED) |
| opts->flags |= GIT_DIFF_FIND_COPIES; |
| |
| #define USE_DEFAULT(X) ((X) == 0 || (X) > 100) |
| |
| if (USE_DEFAULT(opts->rename_threshold)) |
| opts->rename_threshold = DEFAULT_THRESHOLD; |
| |
| if (USE_DEFAULT(opts->rename_from_rewrite_threshold)) |
| opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD; |
| |
| if (USE_DEFAULT(opts->copy_threshold)) |
| opts->copy_threshold = DEFAULT_THRESHOLD; |
| |
| if (USE_DEFAULT(opts->break_rewrite_threshold)) |
| opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD; |
| |
| #undef USE_DEFAULT |
| |
| if (!opts->target_limit) { |
| int32_t limit = 0; |
| |
| opts->target_limit = DEFAULT_TARGET_LIMIT; |
| |
| if (git_config_get_int32(&limit, cfg, "diff.renameLimit") < 0) |
| giterr_clear(); |
| else if (limit > 0) |
| opts->target_limit = limit; |
| } |
| |
| return 0; |
| } |
| |
| static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) |
| { |
| git_vector onto = GIT_VECTOR_INIT; |
| size_t i; |
| git_diff_delta *delta; |
| |
| if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) |
| return -1; |
| |
| /* build new delta list without TO_DELETE and splitting TO_SPLIT */ |
| git_vector_foreach(&diff->deltas, i, delta) { |
| if (delta->status == GIT_DELTA__TO_DELETE) { |
| git__free(delta); |
| continue; |
| } |
| |
| if (delta->status == GIT_DELTA__TO_SPLIT) { |
| git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool); |
| if (!deleted) |
| return -1; |
| |
| deleted->status = GIT_DELTA_DELETED; |
| memset(&deleted->new_file, 0, sizeof(deleted->new_file)); |
| deleted->new_file.path = deleted->old_file.path; |
| deleted->new_file.flags |= GIT_DIFF_FILE_VALID_OID; |
| |
| git_vector_insert(&onto, deleted); |
| |
| delta->status = GIT_DELTA_ADDED; |
| memset(&delta->old_file, 0, sizeof(delta->old_file)); |
| delta->old_file.path = delta->new_file.path; |
| delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; |
| } |
| |
| git_vector_insert(&onto, delta); |
| } |
| |
| /* swap new delta list into place */ |
| git_vector_sort(&onto); |
| git_vector_swap(&diff->deltas, &onto); |
| git_vector_free(&onto); |
| |
| return 0; |
| } |
| |
| static unsigned int calc_similarity( |
| void *cache, git_diff_file *old_file, git_diff_file *new_file) |
| { |
| GIT_UNUSED(cache); |
| |
| if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0) |
| return 100; |
| |
| /* TODO: insert actual similarity algo here */ |
| |
| return 0; |
| } |
| |
| #define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) |
| |
| int git_diff_find_similar( |
| git_diff_list *diff, |
| git_diff_find_options *given_opts) |
| { |
| unsigned int i, j, similarity; |
| git_diff_delta *from, *to; |
| git_diff_find_options opts; |
| unsigned int tried_targets, num_changes = 0; |
| git_vector matches = GIT_VECTOR_INIT; |
| |
| if (normalize_find_opts(diff, &opts, given_opts) < 0) |
| return -1; |
| |
| /* first do splits if requested */ |
| |
| if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) { |
| git_vector_foreach(&diff->deltas, i, from) { |
| if (from->status != GIT_DELTA_MODIFIED) |
| continue; |
| |
| /* Right now, this doesn't work right because the similarity |
| * algorithm isn't actually implemented... |
| */ |
| similarity = 100; |
| /* calc_similarity(NULL, &from->old_file, from->new_file); */ |
| |
| if (similarity < opts.break_rewrite_threshold) { |
| from->status = GIT_DELTA__TO_SPLIT; |
| num_changes++; |
| } |
| } |
| |
| /* apply splits as needed */ |
| if (num_changes > 0 && |
| apply_splits_and_deletes( |
| diff, diff->deltas.length + num_changes) < 0) |
| return -1; |
| } |
| |
| /* next find the most similar delta for each rename / copy candidate */ |
| |
| if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0) |
| return -1; |
| |
| git_vector_foreach(&diff->deltas, i, from) { |
| tried_targets = 0; |
| |
| git_vector_foreach(&diff->deltas, j, to) { |
| if (i == j) |
| continue; |
| |
| switch (to->status) { |
| case GIT_DELTA_ADDED: |
| case GIT_DELTA_UNTRACKED: |
| case GIT_DELTA_RENAMED: |
| case GIT_DELTA_COPIED: |
| break; |
| default: |
| /* only the above status values should be checked */ |
| continue; |
| } |
| |
| /* skip all but DELETED files unless copy detection is on */ |
| if (from->status != GIT_DELTA_DELETED && |
| !FLAG_SET(opts, GIT_DIFF_FIND_COPIES)) |
| continue; |
| |
| /* don't check UNMODIFIED files as source unless given option */ |
| if (from->status == GIT_DELTA_UNMODIFIED && |
| !FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) |
| continue; |
| |
| /* cap on maximum files we'll examine */ |
| if (++tried_targets > opts.target_limit) |
| break; |
| |
| /* calculate similarity and see if this pair beats the |
| * similarity score of the current best pair. |
| */ |
| similarity = calc_similarity(NULL, &from->old_file, &to->new_file); |
| |
| if (to->similarity < similarity) { |
| to->similarity = similarity; |
| if (git_vector_set(NULL, &matches, j, from) < 0) |
| return -1; |
| } |
| } |
| } |
| |
| /* next rewrite the diffs with renames / copies */ |
| |
| num_changes = 0; |
| |
| git_vector_foreach(&diff->deltas, j, to) { |
| from = GIT_VECTOR_GET(&matches, j); |
| if (!from) { |
| assert(to->similarity == 0); |
| continue; |
| } |
| |
| /* three possible outcomes here: |
| * 1. old DELETED and if over rename threshold, |
| * new becomes RENAMED and old goes away |
| * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and |
| * old is more similar to new than it is to itself, in which |
| * case, new becomes RENAMED and old becomed ADDED |
| * 3. otherwise if over copy threshold, new becomes COPIED |
| */ |
| |
| if (from->status == GIT_DELTA_DELETED) { |
| if (to->similarity < opts.rename_threshold) { |
| to->similarity = 0; |
| continue; |
| } |
| |
| to->status = GIT_DELTA_RENAMED; |
| memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); |
| |
| from->status = GIT_DELTA__TO_DELETE; |
| num_changes++; |
| |
| continue; |
| } |
| |
| if (from->status == GIT_DELTA_MODIFIED && |
| FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && |
| to->similarity > opts.rename_threshold) |
| { |
| similarity = 100; |
| /* calc_similarity(NULL, &from->old_file, from->new_file); */ |
| |
| if (similarity < opts.rename_from_rewrite_threshold) { |
| to->status = GIT_DELTA_RENAMED; |
| memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); |
| |
| from->status = GIT_DELTA_ADDED; |
| memset(&from->old_file, 0, sizeof(from->old_file)); |
| from->old_file.path = to->old_file.path; |
| from->old_file.flags |= GIT_DIFF_FILE_VALID_OID; |
| |
| continue; |
| } |
| } |
| |
| if (to->similarity < opts.copy_threshold) { |
| to->similarity = 0; |
| continue; |
| } |
| |
| /* convert "to" to a COPIED record */ |
| to->status = GIT_DELTA_COPIED; |
| memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); |
| } |
| |
| git_vector_free(&matches); |
| |
| if (num_changes > 0) { |
| assert(num_changes < diff->deltas.length); |
| |
| if (apply_splits_and_deletes( |
| diff, diff->deltas.length - num_changes) < 0) |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| #undef FLAG_SET |