In [None]:
# This code is used to extract citation information from WTO reports in Nyarko and Hsiang, "Conforming against Expectations:
# The Formalism of Non-Lawyers at the World Trade Organization", Journal of Legal Studies (2019). The code is written in
# Python 2 and has not been updated to work in Python 3.
# The decisions are assumed to be in .html format; the files are available at www.juliannyarko.com.


# Load packages
import os
import re
import csv
from operator import itemgetter
from itertools import groupby
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import sys
import html2text
from __future__ import division

In [None]:
def citation_info(filename, directory):
    """Collect citation counts and word counts for the findings section of one decision.

    This function targets Python 2: it relies on ``str.decode`` and on the
    ``'rU'`` file mode.

    Args:
        filename: Name of an .html decision inside ``directory``. A decision that
            is split over several files is processed as a whole when its first
            part (a name containing ``'Pt1.'``) is passed in.
        directory: Folder that holds the decision files.

    Returns:
        None if ``filename`` contains ``'Pt'`` but not ``'Pt1.'`` (a later part of
        a multi-part decision). Otherwise a dict with the keys ``prec_abs``,
        ``gatt_abs``, ``wto_abs``, ``cite_variety``, ``abr_variety``, ``abr_abs``,
        ``panel_abs``, ``num_words``, ``num_words_total``, ``prec_abs_italic``,
        ``prec_abs_abr``, ``mean_ab_year``, ``decision`` and ``filename``.
        The first seven count fields are always 0 and ``mean_ab_year`` is always
        None here, because this function does not classify individual
        citations; only the word counts and the two raw citation counts
        (``prec_abs_italic``, ``prec_abs_abr``) are computed.

    Raises:
        IOError: If a single-file decision cannot be opened.
    """
    # Parts 2, 3, ... of a multi-part decision are read together with part 1.
    if 'Pt' in filename and 'Pt1.' not in filename:
        return None

    if 'Pt1.' in filename:
        # If the file consists of multiple parts, load the entire text first.
        document_text_parts = []
        htm = ''
        i = 1
        while True:
            filename_parts = re.sub('Pt1', 'Pt' + str(i), filename)
            try:
                with open(directory + '/' + filename_parts, 'rU') as f:  # open html
                    htm_part = f.read()  # read html
            except IOError:
                # The first part number that cannot be opened ends the decision.
                # Only the open/read is guarded, so a conversion problem in an
                # existing part is no longer mistaken for "no more parts".
                break
            htm = htm + htm_part  # add html code to the html code of the other parts
            html = htm_part.decode('ascii', 'ignore')  # decode html
            document_text_parts.append(html2text.html2text(html))  # plain text of this part
            i = i + 1
        # Note: 'entire_text' is only used to count the total number of words.
        # A differently formatted version of the text is created below.
        entire_text = '\n\n'.join(document_text_parts)
    else:
        # If the file does not consist of multiple parts, just take the text of the one file.
        with open(directory + '/' + filename, 'rU') as f:
            htm = f.read()
        entire_text = html2text.html2text(htm.decode('ascii', 'ignore'))

    # Rewrite headings that would confuse the search for the Findings section,
    # so that a search for 'FINDINGS' (or 'CONCL') does not match them.
    decoys = (
        ('ADDITIONAL FINDINGS', 'ADDITIONAL FINDINGZ'),
        ('ON THE FINDINGS AND CONCLUSIONS', 'ON THE FINDINGZ AND KONCLUSIONS'),
        ('SUMMARY OF FINDINGS', 'SUMMARY OF FINDINGZ'),
        ('OUR MAIN FINDINGS', 'OUR MAIN FINDINGZ'),
        ('LIMIT ITS FINDINGS', 'LIMIT ITS FINDINGZ'),
        ('PUBLIC FACTUAL FINDINGS', 'PUBLIC FACTUAL FINDINGZ'),
        ('FINDINGS OF THE PANEL AND APPELLATE BODY', 'FINDINGZ OF THE PANEL AND APPELLATE BODY'),
        ('FINDINGS OF THE PANEL AND THE APPELLATE BODY', 'FINDINGZ OF THE PANEL AND THE APPELLATE BODY'),
        ('REVIEW OF THE FINDINGS BY THE US INVESTIGATING', 'REVIEW OF THE FINDINGZ BY THE US INVESTIGATING'),
    )
    for pattern, replacement in decoys:
        htm = re.sub(pattern, replacement, htm)

    # Cut the html down to the findings section: from the last 'FINDINGS' up to
    # the next 'CONCL'. If there is no 'CONCL' after it, take everything from
    # the last 'FINDINGS' to the end of the document. If 'FINDINGS' does not
    # occur at all, the whole document is kept.
    findings = re.search(r'FINDINGS(?!.+FINDINGS)(.+?CONCL)', htm, re.DOTALL)
    if findings is not None:
        htm = findings.group()
    else:
        findings = re.search(r'(FINDINGS(?!.+FINDINGS)(.+))', htm, re.DOTALL)
        if findings is not None:
            htm = findings.group()
            findings = re.search(r'(FINDINGS(?! [A-Z])(?!.+FINDINGS)(.+))', htm, re.DOTALL)
            if findings is not None:
                htm = findings.group()

    # Drop everything before the first opening tag <*>, as BeautifulSoup needs
    # html that starts with an opening tag to work adequately. If there is no
    # opening tag at all, the text is left unchanged.
    first_tag = re.search(r'(<(?!/).*)', htm, re.DOTALL)
    if first_tag is not None:
        htm = first_tag.group()

    # Standardize how hyphens are marked up; this matters because case names
    # contain hyphens and must end up inside a single <i> element.
    for split_hyphen, joined_hyphen in (
            ('</i>- <i>', '- '),
            ('</i> - <i>', ' - '),
            ('</i> -<i>', ' -'),
            ('</i>– <i>', '– '),
            ('</i> – <i>', ' – '),
            ('</i> –<i>', ' –')):
        htm = re.sub(split_hyphen, joined_hyphen, htm)

    htm_text = htm.decode('ascii', 'ignore')
    htm_text = html2text.html2text(htm_text)  # get text from html
    htm_text = re.sub('\n', ' ', htm_text)  # substitute line breaks with spaces
    htm_text = re.sub('  +', ' ', htm_text)  # collapse runs of spaces
    htm_text = re.sub('that contracting party was entitled,', 'that contracting party was entitled,"', htm_text)
    text = BeautifulSoup(htm)

    # First method for finding case names: italicized text that includes a hyphen.
    matches_var1 = []
    for element in text.find_all('i'):
        new = element.get_text().encode('utf8', 'ignore')
        if ' – ' in new or ' - ' in new:
            matches_var1.append(new)

    # Second method: document symbols of the form wt/ds## and bisd references.
    document_text = re.sub(r'[).,(;:]', ' ', htm_text.lower())  # remove punctuation
    document_text = re.sub(r'\s+', ' ', document_text)  # normalize whitespace
    matches_wto = re.findall(r'(wt/ds[0-9]+.+?) ', document_text)  # citations in format wt/ds##
    matches_gatt = re.findall(r'(bisd.+?) ', document_text)  # citations in format bisd

    # Frequency of every distinct wt/ds citation.
    match_dict = {}
    for match in matches_wto:
        match_dict[match] = match_dict.get(match, 0) + 1

    # 'Bad' citations are citations where the case cites its own dispute number.
    # They are recognized by frequency: every symbol that occurs more often than
    # 80% of the most frequent symbol's count is treated as a self-citation.
    # With no wt/ds citation at all there is nothing to remove.
    triggers = set()
    if match_dict:
        frequency = round(max(match_dict.values()) * 0.8, 0)
        for match, freq in match_dict.items():
            if freq > frequency:
                bad_trigger = re.search(r'ds[0-9]+?/', match).group()
                triggers.add(bad_trigger)
                # Also catch the same dispute number written without leading zeros.
                if bad_trigger.startswith('ds00'):
                    triggers.add(re.sub('ds00', 'ds', bad_trigger))
                elif bad_trigger.startswith('ds0'):
                    triggers.add(re.sub('ds0', 'ds', bad_trigger))

    # Remove every citation that contains one of the self-citation triggers.
    matches_wto = [row for row in matches_wto
                   if not any(trigger in row for trigger in triggers)]
    matches_var2 = matches_gatt + matches_wto

    # Count words in the full text. Line breaks are turned into spaces BEFORE
    # the non-alphanumeric characters are stripped; stripping first would delete
    # the newlines and fuse the words on either side of every line break.
    entire_text = re.sub('\n', ' ', entire_text)
    entire_text = re.sub('[^A-Za-z0-9 ]+', '', entire_text)
    entire_text = re.sub('  +', ' ', entire_text)
    num_words_total = len(entire_text.split())
    # Count words in the findings section.
    htm_text = re.sub('[^A-Za-z0-9 ]+', '', htm_text)
    num_words = len(htm_text.split())
    # The total can never be smaller than the findings section.
    if num_words_total < num_words:
        num_words_total = num_words

    # Citations are not classified in this function, so these stay at zero.
    abr_counter = 0
    gatt_counter = 0
    cit_counter = 0
    case_variety = []
    abr_variety = []
    mean_year = None

    # Store everything in a dictionary.
    my_dict = {}
    my_dict['prec_abs'] = cit_counter
    my_dict['gatt_abs'] = gatt_counter
    my_dict['wto_abs'] = cit_counter - gatt_counter
    my_dict['cite_variety'] = len(set(case_variety))
    my_dict['abr_variety'] = len(set(abr_variety))
    my_dict['abr_abs'] = abr_counter
    my_dict['panel_abs'] = cit_counter - abr_counter
    my_dict['num_words'] = num_words
    my_dict['num_words_total'] = num_words_total
    my_dict['prec_abs_italic'] = len(matches_var1)
    my_dict['prec_abs_abr'] = len(matches_var2)
    my_dict['mean_ab_year'] = mean_year

    # Decision name: the file name without the '_'-separated piece that carries
    # the part marker ('PT' is checked first, then 'Pt').
    name = filename
    for marker in ('PT', 'Pt'):
        if marker in filename:
            kept = [piece for piece in filename.split('_') if marker not in piece]
            name = '_'.join(kept) + '.html'
            break
    my_dict['decision'] = name
    my_dict['filename'] = filename
    # Return the dictionary.
    return my_dict

In [None]:
# Loop through all panel reports, collect the citation information for each one
# and store the result in the_output.csv.
directory = 'C:/Users/<username>/<location>/decisions' # specify location of decisions
# Collect all file names that have PR (for panel report) in them and are html.
filenames = [fname for fname in os.listdir(directory)
             if fname.endswith('.html') and 'PR' in fname]
counter = 0
all_dta = []

for counter, filename in enumerate(filenames, 1): # loop over filenames
    all_dta.append(citation_info(filename, directory)) # extract citation info
    if counter % 10 == 0:
        # Written so that it prints the same line under Python 2 and Python 3.
        print('%s of %s files scanned' % (counter, len(filenames)))

# citation_info returns None for the later parts of multi-part decisions.
all_dta = [row for row in all_dta if row is not None]
if not all_dta:
    # Without at least one row there are no column names to write.
    raise ValueError('no citation data was extracted from %s' % directory)
keys = all_dta[0].keys()
# Store everything in the_output.csv ('wb' is the mode the Python 2 csv module expects).
with open('C:/Users/<username>/<location>/data/the_output.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_dta)

In [None]:
def citation_info_all(filename, directory):
    """Collect citation counts and word counts for the entire text of one decision.

    Unlike the findings-only variant, no section is cut out: citations are
    searched in the whole document, and every citation found is looked up in
    the module-level ``citation_format`` and ``ab_year`` collections, which must
    be defined before this function is called.

    This function targets Python 2: it relies on ``str.decode`` and on the
    ``'rU'`` file mode.

    Args:
        filename: Name of an .html decision inside ``directory``. A decision that
            is split over several files is processed as a whole when its first
            part (a name containing ``'Pt1.'``) is passed in.
        directory: Folder that holds the decision files.

    Returns:
        None if ``filename`` contains ``'Pt'`` but not ``'Pt1.'`` (a later part of
        a multi-part decision). Otherwise a dict with the keys ``prec_abs``,
        ``gatt_abs``, ``wto_abs``, ``cite_variety``, ``abr_variety``, ``abr_abs``,
        ``panel_abs``, ``num_words``, ``num_words_total``, ``prec_abs_italic``,
        ``prec_abs_abr``, ``mean_ab_year``, ``decision`` and ``filename``.
        ``mean_ab_year`` is None when no year is available for any cited
        report flagged with ``ab_report == 1``.

    Raises:
        IOError: If a single-file decision cannot be opened.
        IndexError: If a cited ``ds_num`` has no entry in ``ab_year``.
    """
    # Parts 2, 3, ... of a multi-part decision are read together with part 1.
    if 'Pt' in filename and 'Pt1.' not in filename:
        return None

    if 'Pt1.' in filename:
        # Multi-part decision: concatenate the html of every part.
        document_text_parts = []
        htm = ''
        i = 1
        while True:
            filename_parts = re.sub('Pt1', 'Pt' + str(i), filename)
            try:
                with open(directory + '/' + filename_parts, 'rU') as f:
                    htm_part = f.read()
            except IOError:
                # The first part number that cannot be opened ends the decision.
                # Only the open/read is guarded, so a conversion problem in an
                # existing part is no longer mistaken for "no more parts".
                break
            htm = htm + htm_part
            html = htm_part.decode('ascii', 'ignore')
            document_text_parts.append(html2text.html2text(html))
            i = i + 1
        # 'entire_text' is only used to count the total number of words.
        entire_text = '\n\n'.join(document_text_parts)
    else:
        with open(directory + '/' + filename, 'rU') as f:
            htm = f.read()
        entire_text = html2text.html2text(htm.decode('ascii', 'ignore'))

    # Same heading substitutions as in the findings-only variant, so that both
    # functions work on identically preprocessed html.
    decoys = (
        ('ADDITIONAL FINDINGS', 'ADDITIONAL FINDINGZ'),
        ('ON THE FINDINGS AND CONCLUSIONS', 'ON THE FINDINGZ AND KONCLUSIONS'),
        ('SUMMARY OF FINDINGS', 'SUMMARY OF FINDINGZ'),
        ('OUR MAIN FINDINGS', 'OUR MAIN FINDINGZ'),
        ('LIMIT ITS FINDINGS', 'LIMIT ITS FINDINGZ'),
        ('PUBLIC FACTUAL FINDINGS', 'PUBLIC FACTUAL FINDINGZ'),
        ('FINDINGS OF THE PANEL AND APPELLATE BODY', 'FINDINGZ OF THE PANEL AND APPELLATE BODY'),
        ('FINDINGS OF THE PANEL AND THE APPELLATE BODY', 'FINDINGZ OF THE PANEL AND THE APPELLATE BODY'),
        ('REVIEW OF THE FINDINGS BY THE US INVESTIGATING', 'REVIEW OF THE FINDINGZ BY THE US INVESTIGATING'),
    )
    for pattern, replacement in decoys:
        htm = re.sub(pattern, replacement, htm)

    # Drop everything before the first opening tag <*>. If there is no opening
    # tag at all, the text is left unchanged.
    first_tag = re.search(r'(<(?!/).*)', htm, re.DOTALL)
    if first_tag is not None:
        htm = first_tag.group()

    # Standardize how hyphens are marked up; this matters because case names
    # contain hyphens and must end up inside a single <i> element.
    for split_hyphen, joined_hyphen in (
            ('</i>- <i>', '- '),
            ('</i> - <i>', ' - '),
            ('</i> -<i>', ' -'),
            ('</i>– <i>', '– '),
            ('</i> – <i>', ' – '),
            ('</i> –<i>', ' –')):
        htm = re.sub(split_hyphen, joined_hyphen, htm)

    htm_text = htm.decode('ascii', 'ignore')
    htm_text = html2text.html2text(htm_text)
    htm_text = re.sub('\n', ' ', htm_text)
    htm_text = re.sub('  +', ' ', htm_text)
    htm_text = re.sub('that contracting party was entitled,', 'that contracting party was entitled,"', htm_text)
    text = BeautifulSoup(htm)

    # First method for finding case names: italicized text that includes a hyphen.
    matches_var1 = []
    for element in text.find_all('i'):
        new = element.get_text().encode('utf8', 'ignore')
        if ' – ' in new or ' - ' in new:
            matches_var1.append(new)

    # Second method: document symbols of the form wt/ds## and bisd references.
    document_text = re.sub(r'[).,(;:]', ' ', htm_text.lower())
    document_text = re.sub(r'\s+', ' ', document_text)
    matches_wto = re.findall(r'(wt/ds[0-9]+.+?) ', document_text)
    matches_gatt = re.findall(r'(bisd.+?) ', document_text)

    # Frequency of every distinct wt/ds citation.
    match_dict = {}
    for match in matches_wto:
        match_dict[match] = match_dict.get(match, 0) + 1

    # Every symbol that occurs more often than 80% of the most frequent symbol's
    # count is treated as the decision citing its own dispute number. With no
    # wt/ds citation at all there is nothing to remove.
    triggers = set()
    if match_dict:
        frequency = round(max(match_dict.values()) * 0.8, 0)
        for match, freq in match_dict.items():
            if freq > frequency:
                bad_trigger = re.search(r'ds[0-9]+?/', match).group()
                triggers.add(bad_trigger)
                # Also catch the same dispute number written without leading zeros.
                if bad_trigger.startswith('ds00'):
                    triggers.add(re.sub('ds00', 'ds', bad_trigger))
                elif bad_trigger.startswith('ds0'):
                    triggers.add(re.sub('ds0', 'ds', bad_trigger))

    # Remove every citation that contains one of the self-citation triggers.
    matches_wto = [row for row in matches_wto
                   if not any(trigger in row for trigger in triggers)]
    matches_var2 = matches_gatt + matches_wto

    # Use whichever method found more citations.
    if len(matches_var1) > len(matches_var2):
        matches = matches_var1
    else:
        matches = matches_var2

    # Count words in the full text. Line breaks are turned into spaces BEFORE
    # the non-alphanumeric characters are stripped; stripping first would delete
    # the newlines and fuse the words on either side of every line break.
    entire_text = re.sub('\n', ' ', entire_text)
    entire_text = re.sub('[^A-Za-z0-9 ]+', '', entire_text)
    entire_text = re.sub('  +', ' ', entire_text)
    num_words_total = len(entire_text.split())
    htm_text = re.sub('[^A-Za-z0-9 ]+', '', htm_text)
    num_words = len(htm_text.split())
    if num_words_total < num_words:
        num_words_total = num_words

    abr_counter = 0
    gatt_counter = 0
    cit_counter = 0
    case_variety = []
    abr_variety = []
    mean_year = None

    # Only the citation_format rows of this file can match, so select them once
    # instead of rescanning the whole collection for every citation.
    rows_for_file = [row for row in citation_format if row['filename'] == filename]
    for match in matches:
        # First row recorded for this citation, if any.
        the_dict = None
        for row in rows_for_file:
            if row['citations'] == match:
                the_dict = row
                break
        if the_dict is None:
            # Citations without a citation_format row are not counted.
            continue
        abr_counter = abr_counter + int(the_dict['ab_report'])
        if int(the_dict['ds_num']) == 1000:
            # ds_num 1000 is counted towards gatt_abs.
            gatt_counter = gatt_counter + 1
        if int(the_dict['ab_report']) == 1:
            abr_variety.append(the_dict['ds_num'])
        case_variety.append(the_dict['ds_num'])
        cit_counter = cit_counter + 1

    if abr_variety:
        # One year per citation (not per distinct report), so reports that are
        # cited more often weigh more in the mean.
        all_years = []
        for ds_num in abr_variety:
            wanted = str(ds_num)
            all_years.append(
                [ab_info for ab_info in ab_year if ab_info['ds_num'] == wanted][0]['year'])
        all_years = [year for year in all_years if year is not None]
        if all_years:
            # Explicit float division, so the result does not depend on
            # 'from __future__ import division' being in effect. If every year
            # is missing, mean_year stays None instead of dividing by zero.
            mean_year = round(float(sum(all_years)) / len(all_years), 2)

    # Store everything in a dictionary.
    my_dict = {}
    my_dict['prec_abs'] = cit_counter
    my_dict['gatt_abs'] = gatt_counter
    my_dict['wto_abs'] = cit_counter - gatt_counter
    my_dict['cite_variety'] = len(set(case_variety))
    my_dict['abr_variety'] = len(set(abr_variety))
    my_dict['abr_abs'] = abr_counter
    my_dict['panel_abs'] = cit_counter - abr_counter
    my_dict['num_words'] = num_words
    my_dict['num_words_total'] = num_words_total
    my_dict['prec_abs_italic'] = len(matches_var1)
    my_dict['prec_abs_abr'] = len(matches_var2)
    my_dict['mean_ab_year'] = mean_year

    # Decision name: the file name without the '_'-separated piece that carries
    # the part marker ('PT' is checked first, then 'Pt').
    name = filename
    for marker in ('PT', 'Pt'):
        if marker in filename:
            kept = [piece for piece in filename.split('_') if marker not in piece]
            name = '_'.join(kept) + '.html'
            break
    my_dict['decision'] = name
    my_dict['filename'] = filename
    return my_dict