Skip to article frontmatter | Skip to article content

Before we start, we need some data to work with. Let’s generate a simple synthetic dataset with the Faker library (extended with the faker_biospecimen providers) and load it into a pandas DataFrame.

import faker
import pandas as pd
import logging
import nltk

# Show the list of directories NLTK searches when it looks for its data.
nltk_search_paths = nltk.data.path
print(nltk_search_paths)
['C:\\Users\\dunn0172/nltk_data', 'c:\\Users\\dunn0172\\Documents\\GitHub\\biorepository_data_wrangling\\.venv\\nltk_data', 'c:\\Users\\dunn0172\\Documents\\GitHub\\biorepository_data_wrangling\\.venv\\share\\nltk_data', 'c:\\Users\\dunn0172\\Documents\\GitHub\\biorepository_data_wrangling\\.venv\\lib\\nltk_data', 'C:\\Users\\dunn0172\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']
# Fetch the root logger (no name given) and lower its threshold to DEBUG.
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.DEBUG)
from faker_biospecimen.study import (
    StudyProvider,
    StudySiteProvider,
    StudyParticipantProvider,
)
from faker_biospecimen.specimen import SpecimenProvider
# Build a US-English Faker instance and register the biospecimen providers
# in the same order as before.
fake = faker.Faker(locale="en_US")
for provider_cls in (
    StudyProvider,
    StudySiteProvider,
    StudyParticipantProvider,
    SpecimenProvider,
):
    fake.add_provider(provider_cls)

# Generate one fake study, asking for five sites, then show the study
# followed by each of its sites on its own line.
study = fake.study(generate_sites=5)
print(study)
for site in study.sites:
    print(site)
Study(name=ISOASPARAGINE, description=The ISOASPARAGINE study aims to investigate the interaction between E1 SAMP-activating enzyme and IgG3 in the context of the striated duct. , start_date=1977-03-31, end_date=1992-06-29, principal_investigator=Researcher(name=Kristin Kelly DDS PhD, email=hwilliams@example.org))
StudySite(site_code=F, name=Perry, Castillo and Smith and Sons, location=59244 Pierce Throughway Suite 953
East Whitneytown, MS 46433, contact=Researcher(name=Alicia Grant MEng MLS(ASCP), email=ashleycopeland@example.org))
StudySite(site_code=W, name=Murillo-Morgan and Sons, location=84941 Blake Underpass
South Raymondchester, IA 87382, contact=Researcher(name=Michele Brown MPH DVM JD, email=matthew09@example.com))
StudySite(site_code=B, name=Pierce-Hurst Ltd, location=7640 Wu Cape
Bellview, VA 11566, contact=Researcher(name=Hayley Gutierrez MBA PharmD, email=blarson@example.org))
StudySite(site_code=L, name=White-Oliver Inc, location=914 Jones Streets
Latoyaside, GU 91429, contact=Researcher(name=Lisa Fields JD DVM, email=imaynard@example.org))
StudySite(site_code=J, name=Mcfarland, Russell and Nelson Ltd, location=77126 Nelson Extension Apt. 818
Port Amanda, MO 81897, contact=Researcher(name=Charles Mccall RN NP, email=michael31@example.net))
# Create one fake participant per study site. The bare name on the last line
# makes the notebook display the resulting list.
participants = [
    fake.study_participant(study=study, site=study_site)
    for study_site in study.sites
]
participants
[StudyParticipant(participant_id=173472, study=Study(name=ISOASPARAGINE, description=The ISOASPARAGINE study aims to investigate the interaction between E1 SAMP-activating enzyme and IgG3 in the context of the striated duct. , start_date=1977-03-31, end_date=1992-06-29, principal_investigator=Researcher(name=Kristin Kelly DDS PhD, email=hwilliams@example.org)), site=StudySite(site_code=F, name=Perry, Castillo and Smith and Sons, location=59244 Pierce Throughway Suite 953 East Whitneytown, MS 46433, contact=Researcher(name=Alicia Grant MEng MLS(ASCP), email=ashleycopeland@example.org)), enrollment_date=1978-07-23, birth_date=1977-03-01 ), StudyParticipant(participant_id=815930, study=Study(name=ISOASPARAGINE, description=The ISOASPARAGINE study aims to investigate the interaction between E1 SAMP-activating enzyme and IgG3 in the context of the striated duct. , start_date=1977-03-31, end_date=1992-06-29, principal_investigator=Researcher(name=Kristin Kelly DDS PhD, email=hwilliams@example.org)), site=StudySite(site_code=W, name=Murillo-Morgan and Sons, location=84941 Blake Underpass South Raymondchester, IA 87382, contact=Researcher(name=Michele Brown MPH DVM JD, email=matthew09@example.com)), enrollment_date=1991-11-02, birth_date=1974-07-14 ), StudyParticipant(participant_id=921110, study=Study(name=ISOASPARAGINE, description=The ISOASPARAGINE study aims to investigate the interaction between E1 SAMP-activating enzyme and IgG3 in the context of the striated duct. 
, start_date=1977-03-31, end_date=1992-06-29, principal_investigator=Researcher(name=Kristin Kelly DDS PhD, email=hwilliams@example.org)), site=StudySite(site_code=B, name=Pierce-Hurst Ltd, location=7640 Wu Cape Bellview, VA 11566, contact=Researcher(name=Hayley Gutierrez MBA PharmD, email=blarson@example.org)), enrollment_date=1989-10-19, birth_date=1988-04-01 ), StudyParticipant(participant_id=992718, study=Study(name=ISOASPARAGINE, description=The ISOASPARAGINE study aims to investigate the interaction between E1 SAMP-activating enzyme and IgG3 in the context of the striated duct. , start_date=1977-03-31, end_date=1992-06-29, principal_investigator=Researcher(name=Kristin Kelly DDS PhD, email=hwilliams@example.org)), site=StudySite(site_code=L, name=White-Oliver Inc, location=914 Jones Streets Latoyaside, GU 91429, contact=Researcher(name=Lisa Fields JD DVM, email=imaynard@example.org)), enrollment_date=1988-12-01, birth_date=1981-04-24 ), StudyParticipant(participant_id=440583, study=Study(name=ISOASPARAGINE, description=The ISOASPARAGINE study aims to investigate the interaction between E1 SAMP-activating enzyme and IgG3 in the context of the striated duct. , start_date=1977-03-31, end_date=1992-06-29, principal_investigator=Researcher(name=Kristin Kelly DDS PhD, email=hwilliams@example.org)), site=StudySite(site_code=J, name=Mcfarland, Russell and Nelson Ltd, location=77126 Nelson Extension Apt. 818 Port Amanda, MO 81897, contact=Researcher(name=Charles Mccall RN NP, email=michael31@example.net)), enrollment_date=1982-01-16, birth_date=1975-11-08 )]
# Generate one fake specimen for each participant.
specimens = [fake.specimen(subject=subject) for subject in participants]

# Convert every specimen to a dict, tabulate the dicts as a DataFrame, and
# preview the first rows.
specimen_rows = [item.to_dict() for item in specimens]
specimen_df = pd.DataFrame(specimen_rows)
specimen_df.head()
Loading...