Resubmit [analyzer] Support for naive cross translation unit analysis

Originally submitted as r326323 and r326324.
Reverted in r326432.

Reverting the commit was a mistake.
The breakage was due to invalid build files in our internal build system;
the CMakeLists did not have any cyclic dependencies.

Cr-Mirrored-From: https://chromium.googlesource.com/external/github.com/llvm-mirror/clang
Cr-Mirrored-Commit: c476211d8b3b3fe69b7166c86bd88e3e09113371
diff --git a/README.md b/README.md
index 54bfc37..1b6fc48 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,32 @@
 Use `--help` to know more about the commands.
 
 
+How to use the experimental Cross Translation Unit analysis
+-----------------------------------------------------------
+
+To run the CTU analysis, a compilation database file has to be created:
+
+    $ intercept-build <your build command>
+
+To run the Clang Static Analyzer against a compilation database
+with CTU analysis enabled, execute:
+    
+    $ analyze-build --ctu
+
+For CTU analysis, an additional collection phase (gathering function
+definitions) is required. For debugging purposes, the collection and the
+analysis phase can be executed separately; in that case, the intermediate
+files used for the analysis are kept on disk in `./ctu-dir`.
+
+    # Collect and store the data required by the CTU analysis
+    $ analyze-build --ctu-collect-only
+    
+    # Analyze using the previously collected data
+    $ analyze-build --ctu-analyze-only
+
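+For example, to keep the collected data in a custom location (the path below
+is only illustrative), pass the same `--ctu-dir` to both phases:
+
+    $ analyze-build --ctu-collect-only --ctu-dir /tmp/ctu-data
+    $ analyze-build --ctu-analyze-only --ctu-dir /tmp/ctu-data
+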
+Use `--help` to get more information about the commands.
+
+
 Limitations
 -----------
 
diff --git a/libscanbuild/__init__.py b/libscanbuild/__init__.py
index 800926e..e7b7487 100644
--- a/libscanbuild/__init__.py
+++ b/libscanbuild/__init__.py
@@ -19,6 +19,9 @@
 
 Execution = collections.namedtuple('Execution', ['pid', 'cwd', 'cmd'])
 
+CtuConfig = collections.namedtuple('CtuConfig', ['collect', 'analyze', 'dir',
+                                                 'func_map_cmd'])
+
 
 def duplicate_check(method):
     """ Predicate to detect duplicated entries.
diff --git a/libscanbuild/analyze.py b/libscanbuild/analyze.py
index 6c129bd..3c93b11 100644
--- a/libscanbuild/analyze.py
+++ b/libscanbuild/analyze.py
@@ -22,16 +22,19 @@
 import subprocess
 import contextlib
 import datetime
+import shutil
+import glob
+from collections import defaultdict
 
 from libscanbuild import command_entry_point, compiler_wrapper, \
-    wrapper_environment, run_build, run_command
+    wrapper_environment, run_build, run_command, CtuConfig
 from libscanbuild.arguments import parse_args_for_scan_build, \
     parse_args_for_analyze_build
 from libscanbuild.intercept import capture
 from libscanbuild.report import document
 from libscanbuild.compilation import split_command, classify_source, \
     compiler_language
-from libscanbuild.clang import get_version, get_arguments
+from libscanbuild.clang import get_version, get_arguments, get_triple_arch
 from libscanbuild.shell import decode
 
 __all__ = ['scan_build', 'analyze_build', 'analyze_compiler_wrapper']
@@ -39,6 +42,9 @@
 COMPILER_WRAPPER_CC = 'analyze-cc'
 COMPILER_WRAPPER_CXX = 'analyze-c++'
 
+CTU_FUNCTION_MAP_FILENAME = 'externalFnMap.txt'
+CTU_TEMP_FNMAP_FOLDER = 'tmpExternalFnMaps'
+
 
 @command_entry_point
 def scan_build():
@@ -56,7 +62,7 @@
             exit_code = capture(args)
             # Run the analyzer against the captured commands.
             if need_analyzer(args.build):
-                run_analyzer_parallel(args)
+                govern_analyzer_runs(args)
         else:
             # Run build command and analyzer with compiler wrappers.
             environment = setup_environment(args)
@@ -75,7 +81,7 @@
     # will re-assign the report directory as new output
     with report_directory(args.output, args.keep_empty) as args.output:
         # Run the analyzer against a compilation db.
-        run_analyzer_parallel(args)
+        govern_analyzer_runs(args)
         # Cover report generation and bug counting.
         number_of_bugs = document(args)
         # Set exit status as it was requested.
@@ -95,6 +101,108 @@
     return len(args) and not re.search('configure|autogen', args[0])
 
 
+def prefix_with(constant, pieces):
+    """ From a sequence create another sequence where every second element
+    is from the original sequence and the odd elements are the prefix.
+
+    eg.: prefix_with(0, [1,2,3]) creates [0, 1, 0, 2, 0, 3] """
+
+    return [elem for piece in pieces for elem in [constant, piece]]
+
+
+def get_ctu_config_from_args(args):
+    """ CTU configuration is created from the chosen phases and dir. """
+
+    return (
+        CtuConfig(collect=args.ctu_phases.collect,
+                  analyze=args.ctu_phases.analyze,
+                  dir=args.ctu_dir,
+                  func_map_cmd=args.func_map_cmd)
+        if hasattr(args, 'ctu_phases') and hasattr(args.ctu_phases, 'dir')
+        else CtuConfig(collect=False, analyze=False, dir='', func_map_cmd=''))
+
+
+def get_ctu_config_from_json(ctu_conf_json):
+    """ CTU configuration is created from the chosen phases and dir. """
+
+    ctu_config = json.loads(ctu_conf_json)
+    # Recover namedtuple from json when coming from analyze-cc or analyze-c++
+    return CtuConfig(collect=ctu_config[0],
+                     analyze=ctu_config[1],
+                     dir=ctu_config[2],
+                     func_map_cmd=ctu_config[3])
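A minimal sketch of the round trip between analyze-build and the analyze-cc /
analyze-c++ wrappers; json.dumps() serializes the namedtuple as a plain JSON
array, and the directory value below is made up:

    import json
    from libscanbuild import CtuConfig
    from libscanbuild.analyze import get_ctu_config_from_json

    # Invented example configuration (a collect-only run).
    cfg = CtuConfig(collect=True, analyze=False,
                    dir='/abs/path/to/ctu-dir',
                    func_map_cmd='clang-func-mapping')
    env_value = json.dumps(cfg)  # '[true, false, "/abs/path/to/ctu-dir", ...]'
    # The wrapper recovers the namedtuple by index.
    assert get_ctu_config_from_json(env_value) == cfg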
+
+
+def create_global_ctu_function_map(func_map_lines):
+    """ Takes iterator of individual function maps and creates a global map
+    keeping only unique names. We leave conflicting names out of CTU.
+
+    :param func_map_lines: Contains the id of a function (mangled name) and
+    the originating source (the corresponding AST file) name.
+    :type func_map_lines: Iterator of str.
+    :returns: Mangled name - AST file pairs.
+    :rtype: List of (str, str) tuples.
+    """
+
+    mangled_to_asts = defaultdict(set)
+
+    for line in func_map_lines:
+        mangled_name, ast_file = line.strip().split(' ', 1)
+        mangled_to_asts[mangled_name].add(ast_file)
+
+    mangled_ast_pairs = []
+
+    for mangled_name, ast_files in mangled_to_asts.items():
+        if len(ast_files) == 1:
+            mangled_ast_pairs.append((mangled_name, next(iter(ast_files))))
+
+    return mangled_ast_pairs
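A quick sketch of the intended behaviour; the mangled names and AST files
below are invented, in the same "mangled-name ast-file" format the collect
phase writes:

    from libscanbuild.analyze import create_global_ctu_function_map

    lines = ['c:@F@foo#I# ast/a.c.ast',
             'c:@F@bar#I# ast/b.c.ast',
             'c:@F@foo#I# ast/c.c.ast']  # 'foo' is defined in two TUs
    pairs = create_global_ctu_function_map(lines)
    # Only the unambiguous entry survives; 'foo' is left out of CTU.
    assert pairs == [('c:@F@bar#I#', 'ast/b.c.ast')]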
+
+
+def merge_ctu_func_maps(ctudir):
+    """ Merge individual function maps into a global one.
+
+    As the collect phase runs in parallel on multiple threads, each
+    compilation unit is mapped into a separate temporary file in
+    CTU_TEMP_FNMAP_FOLDER. These function maps contain the mangled names of
+    functions and the source (the AST generated from it) that defines them.
+    These files should be merged at the end into a global map file:
+    CTU_FUNCTION_MAP_FILENAME."""
+
+    def generate_func_map_lines(fnmap_dir):
+        """ Iterate over all lines of input files in a determined order. """
+
+        files = glob.glob(os.path.join(fnmap_dir, '*'))
+        files.sort()
+        for filename in files:
+            with open(filename, 'r') as in_file:
+                for line in in_file:
+                    yield line
+
+    def write_global_map(arch, mangled_ast_pairs):
+        """ Write (mangled function name, ast file) pairs into final file. """
+
+        extern_fns_map_file = os.path.join(ctudir, arch,
+                                           CTU_FUNCTION_MAP_FILENAME)
+        with open(extern_fns_map_file, 'w') as out_file:
+            for mangled_name, ast_file in mangled_ast_pairs:
+                out_file.write('%s %s\n' % (mangled_name, ast_file))
+
+    triple_arches = glob.glob(os.path.join(ctudir, '*'))
+    for triple_path in triple_arches:
+        if os.path.isdir(triple_path):
+            triple_arch = os.path.basename(triple_path)
+            fnmap_dir = os.path.join(ctudir, triple_arch,
+                                     CTU_TEMP_FNMAP_FOLDER)
+
+            func_map_lines = generate_func_map_lines(fnmap_dir)
+            mangled_ast_pairs = create_global_ctu_function_map(func_map_lines)
+            write_global_map(triple_arch, mangled_ast_pairs)
+
+            # Remove all temporary files
+            shutil.rmtree(fnmap_dir, ignore_errors=True)
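A rough, self-contained sketch of the on-disk layout this operates on; the
triple directory name and the map entry are invented:

    import os
    import tempfile
    from libscanbuild.analyze import merge_ctu_func_maps

    ctudir = tempfile.mkdtemp()
    fnmap_dir = os.path.join(ctudir, 'x86_64--linux-gnu', 'tmpExternalFnMaps')
    os.makedirs(fnmap_dir)
    # One temporary map per translation unit, written by the collect phase.
    with open(os.path.join(fnmap_dir, 'tu1'), 'w') as handle:
        handle.write('c:@F@foo#I# ast/a.c.ast\n')

    merge_ctu_func_maps(ctudir)
    # The per-TU maps are merged into one file per triple and then removed.
    merged = os.path.join(ctudir, 'x86_64--linux-gnu', 'externalFnMap.txt')
    with open(merged) as handle:
        assert handle.read() == 'c:@F@foo#I# ast/a.c.ast\n'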
+
+
 def run_analyzer_parallel(args):
     """ Runs the analyzer against the given compilation database. """
 
@@ -109,7 +217,8 @@
         'output_format': args.output_format,
         'output_failures': args.output_failures,
         'direct_args': analyzer_params(args),
-        'force_debug': args.force_debug
+        'force_debug': args.force_debug,
+        'ctu': get_ctu_config_from_args(args)
     }
 
     logging.debug('run analyzer against compilation database')
@@ -127,6 +236,38 @@
         pool.join()
 
 
+def govern_analyzer_runs(args):
+    """ Governs multiple runs in CTU mode or runs once in normal mode. """
+
+    ctu_config = get_ctu_config_from_args(args)
+    # If we do a CTU collect (1st phase) we remove all previous collection
+    # data first.
+    if ctu_config.collect:
+        shutil.rmtree(ctu_config.dir, ignore_errors=True)
+
+    # If the user asked for both the collect (1st) and the analyze (2nd)
+    # phase, we do an all-in-one run and deliberately remove the collection
+    # data both before and after the run. If the user asks for only a single
+    # phase, the data is left on disk so that multiple analyze runs can reuse
+    # the data gathered by a single collection run.
+    if ctu_config.collect and ctu_config.analyze:
+        # The dir and func_map_cmd values are read from args.ctu_dir and
+        # args.func_map_cmd, so they can be left empty here.
+        args.ctu_phases = CtuConfig(collect=True, analyze=False,
+                                    dir='', func_map_cmd='')
+        run_analyzer_parallel(args)
+        merge_ctu_func_maps(ctu_config.dir)
+        args.ctu_phases = CtuConfig(collect=False, analyze=True,
+                                    dir='', func_map_cmd='')
+        run_analyzer_parallel(args)
+        shutil.rmtree(ctu_config.dir, ignore_errors=True)
+    else:
+        # Single runs (collect or analyze) are launched from here.
+        run_analyzer_parallel(args)
+        if ctu_config.collect:
+            merge_ctu_func_maps(ctu_config.dir)
+
+
 def setup_environment(args):
     """ Set up environment for build command to interpose compiler wrapper. """
 
@@ -140,7 +281,8 @@
         'ANALYZE_BUILD_REPORT_FORMAT': args.output_format,
         'ANALYZE_BUILD_REPORT_FAILURES': 'yes' if args.output_failures else '',
         'ANALYZE_BUILD_PARAMETERS': ' '.join(analyzer_params(args)),
-        'ANALYZE_BUILD_FORCE_DEBUG': 'yes' if args.force_debug else ''
+        'ANALYZE_BUILD_FORCE_DEBUG': 'yes' if args.force_debug else '',
+        'ANALYZE_BUILD_CTU': json.dumps(get_ctu_config_from_args(args))
     })
     return environment
 
@@ -173,7 +315,8 @@
                                  '').split(' '),
         'force_debug': os.getenv('ANALYZE_BUILD_FORCE_DEBUG'),
         'directory': execution.cwd,
-        'command': [execution.cmd[0], '-c'] + compilation.flags
+        'command': [execution.cmd[0], '-c'] + compilation.flags,
+        'ctu': get_ctu_config_from_json(os.getenv('ANALYZE_BUILD_CTU'))
     }
     # call static analyzer against the compilation
     for source in compilation.files:
@@ -223,14 +366,6 @@
     """ A group of command line arguments can mapped to command
     line arguments of the analyzer. This method generates those. """
 
-    def prefix_with(constant, pieces):
-        """ From a sequence create another sequence where every second element
-        is from the original sequence and the odd elements are the prefix.
-
-        eg.: prefix_with(0, [1,2,3]) creates [0, 1, 0, 2, 0, 3] """
-
-        return [elem for piece in pieces for elem in [constant, piece]]
-
     result = []
 
     if args.store_model:
@@ -294,8 +429,9 @@
           'direct_args',  # arguments from command line
           'force_debug',  # kill non debug macros
           'output_dir',  # where generated report files shall go
-          'output_format',  # it's 'plist' or 'html' or both
-          'output_failures'])  # generate crash reports or not
+          'output_format',  # it's 'plist', 'html', both or plist-multi-file
+          'output_failures',  # generate crash reports or not
+          'ctu'])  # ctu control options
 def run(opts):
     """ Entry point to run (or not) static analyzer against a single entry
     of the compilation database.
@@ -383,7 +519,10 @@
 
     def target():
         """ Creates output file name for reports. """
-        if opts['output_format'] in {'plist', 'plist-html'}:
+        if opts['output_format'] in {
+                'plist',
+                'plist-html',
+                'plist-multi-file'}:
             (handle, name) = tempfile.mkstemp(prefix='report-',
                                               suffix='.plist',
                                               dir=opts['output_dir'])
@@ -407,8 +546,109 @@
         return result
 
 
+def func_map_list_src_to_ast(func_src_list):
+    """ Turns textual function map list with source files into a
+    function map list with ast files. """
+
+    func_ast_list = []
+    for fn_src_txt in func_src_list:
+        mangled_name, path = fn_src_txt.split(" ", 1)
+        # Normalize the path (strip the drive letter on Windows)
+        path = os.path.splitdrive(path)[1]
+        # Make relative path out of absolute
+        path = path[1:] if path[0] == os.sep else path
+        ast_path = os.path.join("ast", path + ".ast")
+        func_ast_list.append(mangled_name + " " + ast_path)
+    return func_ast_list
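For illustration, on a POSIX system the transformation looks roughly like
this; the mangled name and the path are invented:

    from libscanbuild.analyze import func_map_list_src_to_ast

    src_lines = ['c:@F@main#I# /home/user/project/main.c']
    print(func_map_list_src_to_ast(src_lines))
    # ['c:@F@main#I# ast/home/user/project/main.c.ast']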
+
+
+@require(['clang', 'directory', 'flags', 'direct_args', 'file', 'ctu'])
+def ctu_collect_phase(opts):
+    """ Preprocess source by generating all data needed by CTU analysis. """
+
+    def generate_ast(triple_arch):
+        """ Generates ASTs for the current compilation command. """
+
+        args = opts['direct_args'] + opts['flags']
+        ast_joined_path = os.path.join(opts['ctu'].dir, triple_arch, 'ast',
+                                       os.path.realpath(opts['file'])[1:] +
+                                       '.ast')
+        ast_path = os.path.abspath(ast_joined_path)
+        ast_dir = os.path.dirname(ast_path)
+        if not os.path.isdir(ast_dir):
+            try:
+                os.makedirs(ast_dir)
+            except OSError:
+                # In case another process has already created it.
+                pass
+        ast_command = [opts['clang'], '-emit-ast']
+        ast_command.extend(args)
+        ast_command.append('-w')
+        ast_command.append(opts['file'])
+        ast_command.append('-o')
+        ast_command.append(ast_path)
+        logging.debug("Generating AST using '%s'", ast_command)
+        run_command(ast_command, cwd=opts['directory'])
+
+    def map_functions(triple_arch):
+        """ Generate function map file for the current source. """
+
+        args = opts['direct_args'] + opts['flags']
+        funcmap_command = [opts['ctu'].func_map_cmd]
+        funcmap_command.append(opts['file'])
+        funcmap_command.append('--')
+        funcmap_command.extend(args)
+        logging.debug("Generating function map using '%s'", funcmap_command)
+        func_src_list = run_command(funcmap_command, cwd=opts['directory'])
+        func_ast_list = func_map_list_src_to_ast(func_src_list)
+        extern_fns_map_folder = os.path.join(opts['ctu'].dir, triple_arch,
+                                             CTU_TEMP_FNMAP_FOLDER)
+        if not os.path.isdir(extern_fns_map_folder):
+            try:
+                os.makedirs(extern_fns_map_folder)
+            except OSError:
+                # In case another process has already created it.
+                pass
+        if func_ast_list:
+            with tempfile.NamedTemporaryFile(mode='w',
+                                             dir=extern_fns_map_folder,
+                                             delete=False) as out_file:
+                out_file.write("\n".join(func_ast_list) + "\n")
+
+    cwd = opts['directory']
+    cmd = [opts['clang'], '--analyze'] + opts['direct_args'] + opts['flags'] \
+        + [opts['file']]
+    triple_arch = get_triple_arch(cmd, cwd)
+    generate_ast(triple_arch)
+    map_functions(triple_arch)
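Roughly, the two helpers above end up invoking commands of the following
shape; all concrete values here are invented placeholders for what comes from
the compilation database and the CTU configuration:

    # Stand-ins for opts['clang'], opts['file'] and direct_args + flags.
    clang, source = 'clang', 'src/foo.c'
    args = ['-Iinclude', '-DNDEBUG']
    ast_path = '/abs/ctu-dir/x86_64--linux-gnu/ast/abs/src/foo.c.ast'

    # generate_ast: dump the AST under <ctu-dir>/<triple>/ast/, mirroring the
    # absolute path of the source file.
    ast_command = [clang, '-emit-ast'] + args + ['-w', source, '-o', ast_path]
    # map_functions: emit "mangled-name source-file" pairs for this TU.
    funcmap_command = ['clang-func-mapping', source, '--'] + args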
+
+
+@require(['ctu'])
+def dispatch_ctu(opts, continuation=run_analyzer):
+    """ Execute only one phase of 2 phases of CTU if needed. """
+
+    ctu_config = opts['ctu']
+
+    if ctu_config.collect or ctu_config.analyze:
+        assert ctu_config.collect != ctu_config.analyze
+        if ctu_config.collect:
+            return ctu_collect_phase(opts)
+        if ctu_config.analyze:
+            cwd = opts['directory']
+            cmd = [opts['clang'], '--analyze'] + opts['direct_args'] \
+                + opts['flags'] + [opts['file']]
+            triarch = get_triple_arch(cmd, cwd)
+            ctu_options = ['ctu-dir=' + os.path.join(ctu_config.dir, triarch),
+                           'experimental-enable-naive-ctu-analysis=true']
+            analyzer_options = prefix_with('-analyzer-config', ctu_options)
+            direct_options = prefix_with('-Xanalyzer', analyzer_options)
+            opts['direct_args'].extend(direct_options)
+
+    return continuation(opts)
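In the analyze phase the extra settings are passed through the driver with
-Xanalyzer/-analyzer-config; with an invented <ctu-dir> and triple, the
options appended to direct_args come out as:

    from libscanbuild.analyze import prefix_with

    ctu_options = ['ctu-dir=/abs/ctu-dir/x86_64--linux-gnu',
                   'experimental-enable-naive-ctu-analysis=true']
    analyzer_options = prefix_with('-analyzer-config', ctu_options)
    direct_options = prefix_with('-Xanalyzer', analyzer_options)
    # direct_options ==
    # ['-Xanalyzer', '-analyzer-config',
    #  '-Xanalyzer', 'ctu-dir=/abs/ctu-dir/x86_64--linux-gnu',
    #  '-Xanalyzer', '-analyzer-config',
    #  '-Xanalyzer', 'experimental-enable-naive-ctu-analysis=true']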
+
+
 @require(['flags', 'force_debug'])
-def filter_debug_flags(opts, continuation=run_analyzer):
+def filter_debug_flags(opts, continuation=dispatch_ctu):
     """ Filter out nondebug macros when requested. """
 
     if opts.pop('force_debug'):
@@ -475,6 +715,7 @@
         logging.debug('analysis, on default arch')
         return continuation(opts)
 
+
 # To have good results from static analyzer certain compiler options shall be
 # omitted. The compiler flag filtering only affects the static analyzer run.
 #
diff --git a/libscanbuild/arguments.py b/libscanbuild/arguments.py
index 2735123..00679a4 100644
--- a/libscanbuild/arguments.py
+++ b/libscanbuild/arguments.py
@@ -18,8 +18,8 @@
 import argparse
 import logging
 import tempfile
-from libscanbuild import reconfigure_logging
-from libscanbuild.clang import get_checkers
+from libscanbuild import reconfigure_logging, CtuConfig
+from libscanbuild.clang import get_checkers, is_ctu_capable
 
 __all__ = ['parse_args_for_intercept_build', 'parse_args_for_analyze_build',
            'parse_args_for_scan_build']
@@ -98,6 +98,11 @@
         # add cdb parameter invisibly to make report module working.
         args.cdb = 'compile_commands.json'
 
+    # Make ctu_dir an absolute path, as it is needed inside clang
+    if not from_build_command and hasattr(args, 'ctu_phases') \
+            and hasattr(args.ctu_phases, 'dir'):
+        args.ctu_dir = os.path.abspath(args.ctu_dir)
+
 
 def validate_args_for_analyze(parser, args, from_build_command):
     """ Command line parsing is done by the argparse module, but semantic
@@ -122,6 +127,18 @@
     elif not from_build_command and not os.path.exists(args.cdb):
         parser.error(message='compilation database is missing')
 
+    # If the user wants CTU mode
+    if not from_build_command and hasattr(args, 'ctu_phases') \
+            and hasattr(args.ctu_phases, 'dir'):
+        # For CTU analyze-only, the input directory must already exist
+        if args.ctu_phases.analyze and not args.ctu_phases.collect \
+                and not os.path.exists(args.ctu_dir):
+            parser.error(message='missing CTU directory')
+        # Check CTU capability by checking for clang-func-mapping
+        if not is_ctu_capable(args.func_map_cmd):
+            parser.error(message="""This version of clang does not support CTU
+            functionality or clang-func-mapping command not found.""")
+
 
 def create_intercept_parser():
     """ Creates a parser for command-line arguments to 'intercept'. """
@@ -218,7 +235,15 @@
         default='html',
         action='store_const',
         help="""Cause the results as a set of .html and .plist files.""")
-    # TODO: implement '-view '
+    format_group.add_argument(
+        '--plist-multi-file',
+        '-plist-multi-file',
+        dest='output_format',
+        const='plist-multi-file',
+        default='html',
+        action='store_const',
+        help="""Cause the results as a set of .plist files with extra
+        information on related files.""")
 
     advanced = parser.add_argument_group('advanced options')
     advanced.add_argument(
@@ -333,6 +358,51 @@
     if from_build_command:
         parser.add_argument(
             dest='build', nargs=argparse.REMAINDER, help="""Command to run.""")
+    else:
+        ctu = parser.add_argument_group('cross translation unit analysis')
+        ctu_mutex_group = ctu.add_mutually_exclusive_group()
+        ctu_mutex_group.add_argument(
+            '--ctu',
+            action='store_const',
+            const=CtuConfig(collect=True, analyze=True,
+                            dir='', func_map_cmd=''),
+            dest='ctu_phases',
+            help="""Perform cross translation unit (ctu) analysis (both collect
+            and analyze phases) using default <ctu-dir> for temporary output.
+            At the end of the analysis, the temporary directory is removed.""")
+        ctu.add_argument(
+            '--ctu-dir',
+            metavar='<ctu-dir>',
+            dest='ctu_dir',
+            default='ctu-dir',
+            help="""Defines the temporary directory used between ctu
+            phases.""")
+        ctu_mutex_group.add_argument(
+            '--ctu-collect-only',
+            action='store_const',
+            const=CtuConfig(collect=True, analyze=False,
+                            dir='', func_map_cmd=''),
+            dest='ctu_phases',
+            help="""Perform only the collect phase of ctu.
+            Keep <ctu-dir> for further use.""")
+        ctu_mutex_group.add_argument(
+            '--ctu-analyze-only',
+            action='store_const',
+            const=CtuConfig(collect=False, analyze=True,
+                            dir='', func_map_cmd=''),
+            dest='ctu_phases',
+            help="""Perform only the analyze phase of ctu. <ctu-dir> should be
+            present and will not be removed after analysis.""")
+        ctu.add_argument(
+            '--use-func-map-cmd',
+            metavar='<path>',
+            dest='func_map_cmd',
+            default='clang-func-mapping',
+            help="""'%(prog)s' uses the 'clang-func-mapping' executable
+            relative to itself for generating function maps for static
+            analysis. One can override this behavior with this option by using
+            the 'clang-func-mapping' packaged with Xcode (on OS X) or from the
+            PATH.""")
     return parser
 
 
diff --git a/libscanbuild/clang.py b/libscanbuild/clang.py
index 192e708..ab42206 100644
--- a/libscanbuild/clang.py
+++ b/libscanbuild/clang.py
@@ -8,11 +8,13 @@
 Since Clang command line interface is so rich, but this project is using only
 a subset of that, it makes sense to create a function specific wrapper. """
 
+import subprocess
 import re
 from libscanbuild import run_command
 from libscanbuild.shell import decode
 
-__all__ = ['get_version', 'get_arguments', 'get_checkers']
+__all__ = ['get_version', 'get_arguments', 'get_checkers', 'is_ctu_capable',
+           'get_triple_arch']
 
 # regex for activated checker
 ACTIVE_CHECKER_PATTERN = re.compile(r'^-analyzer-checker=(.*)$')
@@ -152,3 +154,26 @@
         raise Exception('Could not query Clang for available checkers.')
 
     return checkers
+
+
+def is_ctu_capable(func_map_cmd):
+    """ Detects if the current (or given) clang and function mapping
+    executables are CTU compatible. """
+
+    try:
+        run_command([func_map_cmd, '-version'])
+    except (OSError, subprocess.CalledProcessError):
+        return False
+    return True
+
+
+def get_triple_arch(command, cwd):
+    """Returns the architecture part of the target triple for the given
+    compilation command. """
+
+    cmd = get_arguments(command, cwd)
+    try:
+        separator = cmd.index("-triple")
+        return cmd[separator + 1]
+    except (IndexError, ValueError):
+        return ""
diff --git a/libscanbuild/report.py b/libscanbuild/report.py
index 54b9695..b3753c1 100644
--- a/libscanbuild/report.py
+++ b/libscanbuild/report.py
@@ -13,7 +13,6 @@
 import os.path
 import sys
 import shutil
-import itertools
 import plistlib
 import glob
 import json
@@ -255,24 +254,29 @@
 
 
 def read_bugs(output_dir, html):
+    # type: (str, bool) -> Generator[Dict[str, Any], None, None]
     """ Generate a unique sequence of bugs from given output directory.
 
     Duplicates can be in a project if the same module was compiled multiple
     times with different compiler options. These would be better to show in
     the final report (cover) only once. """
 
-    parser = parse_bug_html if html else parse_bug_plist
-    pattern = '*.html' if html else '*.plist'
+    def empty(file_name):
+        return os.stat(file_name).st_size == 0
 
     duplicate = duplicate_check(
         lambda bug: '{bug_line}.{bug_path_length}:{bug_file}'.format(**bug))
 
-    bugs = itertools.chain.from_iterable(
-        # parser creates a bug generator not the bug itself
-        parser(filename)
-        for filename in glob.iglob(os.path.join(output_dir, pattern)))
+    # get the right parser for the job.
+    parser = parse_bug_html if html else parse_bug_plist
+    # get the input files, which are not empty.
+    pattern = os.path.join(output_dir, '*.html' if html else '*.plist')
+    bug_files = (file for file in glob.iglob(pattern) if not empty(file))
 
-    return (bug for bug in bugs if not duplicate(bug))
+    for bug_file in bug_files:
+        for bug in parser(bug_file):
+            if not duplicate(bug):
+                yield bug
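Since read_bugs is now a generator, callers can iterate or count bugs lazily;
a minimal sketch with an invented report directory:

    from libscanbuild.report import read_bugs

    unique_bugs = list(read_bugs('/path/to/report-dir', html=False))
    print('found %d unique bug(s)' % len(unique_bugs))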
 
 
 def parse_bug_plist(filename):
diff --git a/tests/unit/test_analyze.py b/tests/unit/test_analyze.py
index a250ff2..9964a29 100644
--- a/tests/unit/test_analyze.py
+++ b/tests/unit/test_analyze.py
@@ -4,12 +4,12 @@
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 
-import libear
-import libscanbuild.analyze as sut
 import unittest
 import re
 import os
 import os.path
+import libear
+import libscanbuild.analyze as sut
 
 
 class ReportDirectoryTest(unittest.TestCase):
@@ -333,3 +333,83 @@
 
     def test_method_exception_not_caught(self):
         self.assertRaises(Exception, method_exception_from_inside, dict())
+
+
+class PrefixWithTest(unittest.TestCase):
+
+    def test_gives_empty_on_empty(self):
+        res = sut.prefix_with(0, [])
+        self.assertFalse(res)
+
+    def test_interleaves_prefix(self):
+        res = sut.prefix_with(0, [1, 2, 3])
+        self.assertListEqual([0, 1, 0, 2, 0, 3], res)
+
+
+class MergeCtuMapTest(unittest.TestCase):
+
+    def test_no_map_gives_empty(self):
+        pairs = sut.create_global_ctu_function_map([])
+        self.assertFalse(pairs)
+
+    def test_multiple_maps_merged(self):
+        concat_map = ['c:@F@fun1#I# ast/fun1.c.ast',
+                      'c:@F@fun2#I# ast/fun2.c.ast',
+                      'c:@F@fun3#I# ast/fun3.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertTrue(('c:@F@fun1#I#', 'ast/fun1.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun2#I#', 'ast/fun2.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun3#I#', 'ast/fun3.c.ast') in pairs)
+        self.assertEqual(3, len(pairs))
+
+    def test_not_unique_func_left_out(self):
+        concat_map = ['c:@F@fun1#I# ast/fun1.c.ast',
+                      'c:@F@fun2#I# ast/fun2.c.ast',
+                      'c:@F@fun1#I# ast/fun7.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertFalse(('c:@F@fun1#I#', 'ast/fun1.c.ast') in pairs)
+        self.assertFalse(('c:@F@fun1#I#', 'ast/fun7.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun2#I#', 'ast/fun2.c.ast') in pairs)
+        self.assertEqual(1, len(pairs))
+
+    def test_duplicates_are_kept(self):
+        concat_map = ['c:@F@fun1#I# ast/fun1.c.ast',
+                      'c:@F@fun2#I# ast/fun2.c.ast',
+                      'c:@F@fun1#I# ast/fun1.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertTrue(('c:@F@fun1#I#', 'ast/fun1.c.ast') in pairs)
+        self.assertTrue(('c:@F@fun2#I#', 'ast/fun2.c.ast') in pairs)
+        self.assertEqual(2, len(pairs))
+
+    def test_space_handled_in_source(self):
+        concat_map = ['c:@F@fun1#I# ast/f un.c.ast']
+        pairs = sut.create_global_ctu_function_map(concat_map)
+        self.assertTrue(('c:@F@fun1#I#', 'ast/f un.c.ast') in pairs)
+        self.assertEqual(1, len(pairs))
+
+
+class FuncMapSrcToAstTest(unittest.TestCase):
+
+    def test_empty_gives_empty(self):
+        fun_ast_lst = sut.func_map_list_src_to_ast([])
+        self.assertFalse(fun_ast_lst)
+
+    def test_sources_to_asts(self):
+        fun_src_lst = ['c:@F@f1#I# ' + os.path.join(os.sep + 'path', 'f1.c'),
+                       'c:@F@f2#I# ' + os.path.join(os.sep + 'path', 'f2.c')]
+        fun_ast_lst = sut.func_map_list_src_to_ast(fun_src_lst)
+        self.assertTrue('c:@F@f1#I# ' +
+                        os.path.join('ast', 'path', 'f1.c.ast')
+                        in fun_ast_lst)
+        self.assertTrue('c:@F@f2#I# ' +
+                        os.path.join('ast', 'path', 'f2.c.ast')
+                        in fun_ast_lst)
+        self.assertEqual(2, len(fun_ast_lst))
+
+    def test_spaces_handled(self):
+        fun_src_lst = ['c:@F@f1#I# ' + os.path.join(os.sep + 'path', 'f 1.c')]
+        fun_ast_lst = sut.func_map_list_src_to_ast(fun_src_lst)
+        self.assertTrue('c:@F@f1#I# ' +
+                        os.path.join('ast', 'path', 'f 1.c.ast')
+                        in fun_ast_lst)
+        self.assertEqual(1, len(fun_ast_lst))
diff --git a/tests/unit/test_clang.py b/tests/unit/test_clang.py
index eef8c26..07ac4d9 100644
--- a/tests/unit/test_clang.py
+++ b/tests/unit/test_clang.py
@@ -92,3 +92,15 @@
         self.assertEqual('Checker One description', result.get('checker.one'))
         self.assertTrue('checker.two' in result)
         self.assertEqual('Checker Two description', result.get('checker.two'))
+
+
+class ClangIsCtuCapableTest(unittest.TestCase):
+    def test_ctu_not_found(self):
+        is_ctu = sut.is_ctu_capable('not-found-clang-func-mapping')
+        self.assertFalse(is_ctu)
+
+
+class ClangGetTripleArchTest(unittest.TestCase):
+    def test_arch_is_not_empty(self):
+        arch = sut.get_triple_arch(['clang', '-E', '-'], '.')
+        self.assertTrue(len(arch) > 0)