Source code for miflowcyt_validate

import requests
import xmljson
import xml.etree.ElementTree as elemTree
from json import dump, load
import os
import jsonbender
import json
from collections import OrderedDict
from jsonschema.validators import RefResolver, Draft4Validator
from validate.jsonschema_validator import validate_instance


[docs]class FlowRepoClient: """ A class that provides functionality to download experiments from the FlowRepository (https://flowrepository.org/), transform the XML into JSON and validate the JSON instances against their JSON schema. The transformation from XML to JSON relies on the JSONBender library (https://github.com/Onyo/jsonbender). """ def __init__(self, mapping, client_id, number_of_items): """ The class constructor :param mapping: the mapping dictionary containing the jsonbender objects (see https://github.com/Onyo/jsonbender) :type mapping: file :param client_id: identifier of the client :type client_id: str """ self.errors = {} self.instances = [] self.main_context_url = "" self.clientID = client_id self.item_number = number_of_items self.bender_mapping_file = mapping self.mapping_url = "https://fairsharing.github.io/mircat/miflowcyt/" \ "schema_context_mapping.json" self.base_schema = "experiment_schema.json" self.schema_url = "https://w3id.org/mircat/miflowcyt/schema/" + self.base_schema
[docs] def get_user_content_id(self): """ Return all IDs found in the user content XML :return: a list of all IDs there were identified in the variable returned by the API """ full_url = "http://flowrepository.org/list?client=" + self.clientID ids = [] response = requests.request("GET", full_url) if response.status_code == 404: return Exception("Verify your client ID (" + self.clientID + ")") else: user_data = xmljson.parker.data((elemTree.fromstring(response.text))) if "public-experiments" in user_data.keys() \ and "experiment" in user_data["public-experiments"].keys(): for experiment in user_data['public-experiments']['experiment']: ids.append(experiment['id']) return ids
[docs] def grab_experiment_from_api(self, item_identifier): """ Retrieve the experimental metadata and return it as XML document object :param item_identifier: the item identifier that should be retrieved :return: the XML document object """ full_url = "http://flowrepository.org/list/" \ + item_identifier \ + "?client=" \ + self.clientID response = requests.request("GET", full_url) if response.status_code == 404 or response.status_code == 400: return Exception("Item %s could not be found" % item_identifier) return response.text
[docs] def get_all_experiments(self, max_number, accessible_ids): """ Grab all experiments from the API for the given number :param max_number: the number of item to retrieve :type max_number: int :param accessible_ids: the ids that this use can fetch :type accessible_ids: list :return: the experiments XMLs """ contents = {} if max_number < len(accessible_ids): for i in range(max_number): current_content = self.grab_experiment_from_api(accessible_ids[i]) contents[accessible_ids[i]] = current_content return contents
[docs] @staticmethod def validate_instance_from_file(instance, item_id, schema_name): """ Method to output the extracted JSON into a file and validate it against the given schema :param instance: the instance to output into a file :param item_id: the instance ID needed to create the file name :param schema_name: the schema to check against :return errors: a list of fields that have an error for this instance """ try: file_name = item_id + '.json' file_full_path = os.path.join(os.path.dirname(__file__), "../tests/data/MiFlowCyt/" + file_name) with open(file_full_path, 'w') as outfile: dump(instance, outfile, indent=4) outfile.close() errors = validate_instance( os.path.join(os.path.dirname(__file__), "../tests/data/MiFlowCyt/"), schema_name, os.path.join(os.path.dirname(__file__), "../tests/data/MiFlowCyt/"), file_name, 1, {}) return errors except FileNotFoundError: return Exception("Please provide a valid schema")
[docs] @staticmethod def get_mapping(mapping_file_name): """ Build the mapping dictionary based on the given mapping file :param mapping_file_name: the name of the mapping file :return mapping: the mapping of the fields """ try: mapping = {} with open(mapping_file_name) as mapping_var: raw_mapping = load(mapping_var) mapping_var.close() # For each mapped field in the mapping file for mapped_item in raw_mapping: # if the value of the field is a string if isinstance(raw_mapping[mapped_item], str): mapping[mapped_item] = jsonbender.OptionalS(raw_mapping[mapped_item]) # if the value of the field is an object elif isinstance(raw_mapping[mapped_item], object): # Raise an error if a value/option is missing if 'value' not in raw_mapping[mapped_item] or \ ('benderOption' not in raw_mapping[mapped_item]): raise Exception("The mapping file is missing a value or the bender option " "for " + mapped_item) else: if raw_mapping[mapped_item]['benderOption'] == "default": mapping[mapped_item] = \ jsonbender.OptionalS(raw_mapping[mapped_item]['value']) elif raw_mapping[mapped_item]['benderOption'] == "raiseErrors": mapping[mapped_item] = jsonbender.S(raw_mapping[mapped_item]['value']) elif raw_mapping[mapped_item]['benderOption'] == "simple": mapping[mapped_item] = jsonbender.K(raw_mapping[mapped_item]['value']) elif raw_mapping[mapped_item]['benderOption'] == "inject": mapping[mapped_item] = jsonbender.F(raw_mapping[mapped_item]['value']) return mapping except FileNotFoundError: return Exception("Mapping file wasn't found")
[docs] def inject_context(self): """ Transform the myflowcyt JSON into a JSON-LD by injecting @context and @type keywords :return: a JSON-LD of the myflowcyt JSON """ instances, errors = self.make_validation() context_mapping = json.loads(requests.get(self.mapping_url).text)["contexts"] self.main_context_url = context_mapping[self.base_schema] for instance_name in instances: instance = instances[instance_name] for field in instance: prop = field + "_schema.json" if prop in context_mapping.keys(): if type(instance[field]) == list: for item in instance[field]: item["@context"] = context_mapping[prop] item["@type"] = field.capitalize() elif type(instance[field]) == dict: instance[field]["@context"] = context_mapping[prop] instance[field]["@type"] = field.capitalize() instance["@context"] = self.main_context_url instance["@type"] = "Experiment" return instances
[docs] def preprocess_content(self, content): """ Preprocess the XML into a JSON that is compliant with the schema. :param content: str containing the XML :type content: str :return: a JSON schema cleaned from residual artifacts """ mapping = self.get_mapping(self.bender_mapping_file) experience_metadata = xmljson.parker.data((elemTree.fromstring( content)))["public-experiments"]["experiment"] extracted_json = jsonbender.bend(mapping, experience_metadata) if extracted_json['organization'] == "\n ": extracted_json['organization'] = [] if extracted_json['keywords'] == "\n ": extracted_json['keywords'] = [] if 'keywords' in extracted_json.keys() and \ type(extracted_json['keywords']) == OrderedDict and \ 'keyword' in extracted_json['keywords'].keys(): if type(extracted_json['keywords']['keyword']) != list: extracted_json['keywords'] = [extracted_json['keywords']['keyword']] else: extracted_json['keywords'] = extracted_json['keywords']['keyword'] if 'organization' in extracted_json.keys() and \ type(extracted_json['organization']) == OrderedDict and \ 'organization' in extracted_json['organization'].keys(): if type(extracted_json['organization']['organization']) != list: extracted_json['organization'] = [extracted_json['organization'][ 'organization']] else: extracted_json['organization'] = extracted_json['organization'][ 'organization'] if 'other' not in extracted_json.keys() \ or extracted_json['other'] is None: extracted_json['other'] = {} if 'related-publications' in extracted_json.keys() and \ type(extracted_json['related-publications']) == OrderedDict and \ 'publication' in extracted_json['related-publications'].keys(): if type(extracted_json['related-publications']['publication']) != list: extracted_json['other']['related-publications'] = [ extracted_json['related-publications']['publication']] else: extracted_json['other']['related-publications'] = \ extracted_json['related-publications']['publication'] if 'related-publications' in extracted_json.keys(): extracted_json.pop('related-publications') for field in extracted_json.keys(): if extracted_json[field] is None: extracted_json[field] = "" if field == "primaryContact": extracted_json["primary_contact"] = extracted_json["primaryContact"] del extracted_json["primaryContact"] return extracted_json
[docs] def make_validation(self): """ Method to run the mapping for the given number of items :return: a dictionary containing the list of errors for all processed items """ valid = {} invalid = {} user_accessible_ids = self.get_user_content_id() # print(user_accessible_ids) if isinstance(user_accessible_ids, Exception): return Exception("Error with client ID " + self.clientID) else: schema = json.loads(requests.get(self.schema_url).text) resolver = RefResolver(self.schema_url, schema, {}) validator = Draft4Validator(schema, resolver=resolver) content = self.get_all_experiments(self.item_number, user_accessible_ids) for raw_experiment in content: experiment = self.preprocess_content(content[raw_experiment]) try: validation = validator.validate(experiment) if validation is None: valid[raw_experiment] = experiment else: invalid[raw_experiment] = validation except Exception as e: invalid[raw_experiment] = "Unexpected error: " + str(e) return valid, invalid