aboutsummaryrefslogtreecommitdiff
path: root/R2R/ingesting
diff options
context:
space:
mode:
Diffstat (limited to 'R2R/ingesting')
-rw-r--r--R2R/ingesting/README.md21
-rwxr-xr-xR2R/ingesting/aging_pt1.json1
-rwxr-xr-xR2R/ingesting/cmds.txt1
-rwxr-xr-xR2R/ingesting/ingest_my_data.py99
4 files changed, 122 insertions, 0 deletions
diff --git a/R2R/ingesting/README.md b/R2R/ingesting/README.md
new file mode 100644
index 00000000..4a3951f6
--- /dev/null
+++ b/R2R/ingesting/README.md
@@ -0,0 +1,21 @@
+
+<p align="left">
+ <a href="https://r2r-docs.sciphi.ai"><img src="https://img.shields.io/badge/docs.sciphi.ai-3F16E4" alt="Docs"></a>
+ <a href="https://discord.gg/p6KqD2kjtB"><img src="https://img.shields.io/discord/1120774652915105934?style=social&logo=discord" alt="Discord"></a>
+ <a href="https://github.com/SciPhi-AI"><img src="https://img.shields.io/github/stars/SciPhi-AI/R2R" alt="Github Stars"></a>
+ <a href="https://github.com/SciPhi-AI/R2R/pulse"><img src="https://img.shields.io/github/commit-activity/w/SciPhi-AI/R2R" alt="Commits-per-week"></a>
+ <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-purple.svg" alt="License: MIT"></a>
+</p>
+
+<img src="../assets/r2r.png" alt="R2R Answer Engine">
+<h3 align="center">
+Ingesting a library
+</h3>
+
+# About
+R2R was designed to bridge the gap between local LLM experimentation and scalable, production-ready Retrieval-Augmented Generation (RAG). R2R provides a comprehensive and SOTA RAG system for developers, built around a RESTful API for ease of use.
+
+`ingest_my_data.py` in the `ingesting` directory shows a process you can use to place multiple PDF files into the RAG's context.
+Because there are limits to how much data can be uploaded and processed by different LLM providers, there are wait times coded into the ingest process.
+
+The code lists many directories because the upload limit was around 35 megabytes and the references spanned many years. \ No newline at end of file
diff --git a/R2R/ingesting/aging_pt1.json b/R2R/ingesting/aging_pt1.json
new file mode 100755
index 00000000..27678257
--- /dev/null
+++ b/R2R/ingesting/aging_pt1.json
@@ -0,0 +1 @@
+{'processed_documents': ["Document '2001 - Human Genome and Diseases.pdf' processed successfully.", "Document '2001 - Demography in the age of genomics.pdf' processed successfully.", "Document '2002 - Roles of the Werner syndrome protein in pathways required.pdf' processed successfully.", "Document '2003 - A Method for Detecting Recent Selection in the Human Genome.pdf' processed successfully.", "Document '2004 - Human blood genomics distinct profiles for gender, age.pdf' processed successfully.", "Document '1996 - Livestock Genomics Comes of Age.pdf' processed successfully.", "Document '2003 - The application of functional genomics.pdf' processed successfully.", "Document '2004 - Genomic instability, aging.pdf' processed successfully.", "Document '2002 - Influence of age, sex, and strength training.pdf' processed successfully.", "Document '2001 - Statistical tests of selective neutrality.pdf' processed successfully.", "Document '1997 - Genetic and functional changes in mitochondria associated with aging.pdf' processed successfully.", "Document '2003 - From basepairs to birdsongs phylogenetic data.pdf' processed successfully.", "Document '2004 - Comparative analysis of processed pseudogenes.pdf' processed successfully.", "Document '2002 - Genome Dynamics in Aging Mice.pdf' processed successfully.", "Document '2004 - Cytonuclear coevolution the genomics.pdf' processed successfully.", "Document '1999 - Functional integrity of mitochondrial genomes in human plateletsand autopsied brain tissues from elderly patients withAlzheimer’s disease.pdf' processed successfully.", "Document '2002 - Large genome rearrangements as a primary cause of aging.pdf' processed successfully.", "Document '2002 - Retroelement Distributions in the Human.pdf' processed successfully.", "Document '2001 - Marked differences in unilateral isolated retinoblastomas.pdf' processed successfully.", "Document '2004 - The emergence of epidemiology.pdf' processed successfully.", "Document '2003 - 
Functional Genomics of Ageing.pdf' processed successfully.", "Document '1999 - Functional Genomics and Rat Models.pdf' processed successfully.", "Document '2004 - A genome scan for diabetic nephropathy in African Americans.pdf' processed successfully.", "Document '2004 - Endosymbiotic gene transfer.pdf' processed successfully.", "Document '2003 - Gene expression profile of aging in human muscle.pdf' processed successfully.", "Document '2002 - Mitochondrial dysfunction leads to telomere attrition.pdf' processed successfully.", "Document '2004 - Proinflammatory phenotype of coronary arteries promotes endothelial.pdf' processed successfully.", "Document '2003 - Age-related changes in the transcriptional profile of mouse RPE choroid.pdf' processed successfully.", "Document '1996 - Isolation and characterization of genomic and cDNA clones encoding.pdf' processed successfully.", "Document '1999 - Molecular Biology of Aging.pdf' processed successfully.", "Document '2004 - RNA-interference-based functional genomics in mammalian cells.pdf' processed successfully.", "Document '1982 - Is Cell Aging Caused by Respiration-Dependent Injury to the Mitochondrial Genome.pdf' processed successfully.", "Document '2004 - Comparing genomic expression patterns across species.pdf' processed successfully.", "Document '1989 - Genetic instability and aging theories, facts, and future perspectives.pdf' processed successfully.", "Document '2004 - Evolutionary history of Oryza sativa LTR retrotransposons.pdf' processed successfully.", "Document '2004 - Genome-Wide Scan for a Healthy Aging.pdf' processed successfully.", "Document '2000 - A Major Gene Affecting Age-Related Hearing Loss Is Common.pdf' processed successfully.", "Document '2001 - Fungal virulence studies come of age.pdf' processed successfully.", "Document '1999 - The caenorhabditis elegans genome.pdf' processed successfully.", "Document '2001 - Plant Systematics in the Age of Genomics.pdf' processed successfully.", "Document '2001 
- Vitamins minerals and genomic stability in humans.pdf' processed successfully.", "Document '2000 - Genome-wide study of aging and oxidative stress.pdf' processed successfully.", "Document '2001 - Genome maintenance mechanisms.pdf' processed successfully.", "Document '2002 - Signatures of Domain Shuffling.pdf' processed successfully.", "Document '2004 - Age-Related Macular Degeneration A High-Resolution Genome Scan.pdf' processed successfully.", "Document '1987 - Genomic 5-Methyldeoxycytidine.pdf' processed successfully.", "Document '2002 - Parkinson’s Disease and Apolipoprotein E Possible.pdf' processed successfully.", "Document '2002 - Functional genomics the coming.pdf' processed successfully.", "Document '2003 - Mitochondrial DNA modifies cognition in interaction.pdf' processed successfully.", "Document '2001 - Methylation meets genomics.pdf' processed successfully.", "Document '2003 - Life-long reduction in MnSOD activity results.pdf' processed successfully.", "Document '2004 - A Transcriptional Profile of Aging.pdf' processed successfully.", "Document '1999 - Rothmund–Thomson Syndrome Responsible Gene, RECQL4.pdf' processed successfully.", "Document '2003 - Caloric restriction promotes genomic stability by induction.pdf' processed successfully.", "Document '2003 - Epigenetic regulation of gene expression how the genome integrates intrinsic and environmental signals.pdf' processed successfully.", "Document '2004 - Genome-wide RNA interference screen identifies.pdf' processed successfully.", "Document '2000 - A full genome scan for age-related maculopathy.pdf' processed successfully.", "Document '1997 - Bioinformatics in a post-genomics age.pdf' processed successfully.", "Document '1990 - Extrachromosomal circular DNAs and genomic sequence plasticity.pdf' processed successfully.", "Document '2004 - Nutritional genomics.pdf' processed successfully.", "Document '2004 - Microbial Genomics and the Periodic Table.pdf' processed successfully.", "Document '1997 - Age 
associated alterations of the mitochondrial genome.pdf' processed successfully.", "Document '2002 - Genome-Wide Transcript Profiles in Aging.pdf' processed successfully.", "Document '2003 - Genetics, genes, genomics and g.pdf' processed successfully.", "Document '1999 - Cell-by-cell scanning of whole mitochondrial genomes.pdf' processed successfully.", "Document '2004 - Prevalence and correlates of orthostatic.pdf' processed successfully.", "Document '2000 - From life to death – the struggle between chemistry and biology during.pdf' processed successfully.", "Document '1998 - The bottleneck mitochondrial imperatives in oogenesis and ovarian.pdf' processed successfully.", "Document '1993 - Genomic Damage and Its Repair.pdf' processed successfully.", "Document '2001 - Mitochondrial genome instability in human cancers.pdf' processed successfully.", "Document '2001 - The genetics of aging.pdf' processed successfully.", "Document '2003 - Lifelong voluntary exercise in the mouse prevents.pdf' processed successfully.", "Document '2004 - Age-associated alteration of gene expression.pdf' processed successfully.", "Document '1999 - Qualitative assessment of Genotoxicity.pdf' processed successfully.", "Document '2001 - Genomic profiling of short- and long-term caloric.pdf' processed successfully.", "Document '2004 - Ageing, repetitive genomes and DNA.pdf' processed successfully.", "Document '2004 - Genomic DNA methylation of juvenile and mature Acacia mangium.pdf' processed successfully.", "Document '2003 - Welcome to the Genomic Era.pdf' processed successfully.", "Document '2003 - Whole-genome screening indicates a possible burst of formation.pdf' processed successfully.", "Document '2003 - Population Screening in the Age.pdf' processed successfully.", "Document '1997 - Tumour-suppressor genes evolving definitions in the genomic age.pdf' processed successfully.", "Document '2003 - Results of a high-resolution genome screen.pdf' processed successfully.", "Document '2003 - A 
Whole-Genome Screen of a Quantitative Trait of Age-Related.pdf' processed successfully.", "Document '1997 - Multi-organ characterization of mitochondrial.pdf' processed successfully.", "Document '2003 - Risk Factors for Hodgkin’s Lymphoma by EBV.pdf' processed successfully.", "Document '2002 - Human mitochondrial DNA with large deletions.pdf' processed successfully.", "Document '2002 - Genomic DNA methylation–demethylation during aging.pdf' processed successfully.", "Document '2003 - Age-related impairment of the transcriptional.pdf' processed successfully.", "Document '1995 - Rearranged Mitochondrial Genomes Are Present in Human.pdf' processed successfully.", "Document '2002 - Population Genomics Ageing.pdf' processed successfully.", "Document '2003 - The Dawning era of polymer.pdf' processed successfully.", "Document '2001 - A genome-wide scan for linkage to human.pdf' processed successfully.", "Document '2003 - Molecular mechanisms of reduced adrenergic signaling.pdf' processed successfully.", "Document '2002 - Telomere dysfunction provokes regional amplification.pdf' processed successfully.", "Document '2004 - Mitochondrial Genome Single Nucleotide.pdf' processed successfully.", "Document '1991 - Detection of Epstein-Barr virus genomes in Hodgkin's disease relation to age.pdf' processed successfully.", "Document '1998 - Microarrays biotechnology’s discovery.pdf' processed successfully.", "Document '2003 - The age of the Arabidopsis thaliana genome duplication.pdf' processed successfully.", "Document '2004 - Whole Genome Scan for Obstructive Sleep Apnea.pdf' processed successfully."], 'failed_documents': ["Document '2003 - Telomeres in the chicken genome stability and chromosome ends.pdf': Error code: 429 - {'error': {'message': 'Rate limit reached for text-embedding-3-small in organization org-p90FbCDZwTfuyHzyjGoHvDaR on tokens per min (TPM): Limit 1000000, Used 996746, Requested 12291. Please try again in 542ms. 
Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}", "Document '1983 - Mitochondrial DNA in Mortal and Immortal Human Cell.pdf': Error code: 429 - {'error': {'message': 'Rate limit reached for text-embedding-3-small in organization org-p90FbCDZwTfuyHzyjGoHvDaR on tokens per min (TPM): Limit 1000000, Used 996385, Requested 12801. Please try again in 551ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}", "Document '2004 - Additive regulation of hepatic gene expression by dwarfism.pdf': Request timed out.", "Document '2004 - A New Resource for Cereal Genomics 22K Barley.pdf': Request timed out.", "Document '1998 - Molecular archaeology of the Escherichia coli genome.pdf': Request timed out.", "Document '2003 - Imaging genomics.pdf': Request timed out.", "Document '2001 - Genomics food and nutrition.pdf': Request timed out.", "Document '2003 - Molecular profiling in the age.pdf': Request timed out."], 'skipped_documents': []}
diff --git a/R2R/ingesting/cmds.txt b/R2R/ingesting/cmds.txt
new file mode 100755
index 00000000..9745b929
--- /dev/null
+++ b/R2R/ingesting/cmds.txt
@@ -0,0 +1 @@
+Time taken to ingest files: 541.67 secondsngest-files ~/SolomonVentures/GeneNetwork/docRequesting...N/Gene\ Network.org\ PDFs/*.pdf \ No newline at end of file
diff --git a/R2R/ingesting/ingest_my_data.py b/R2R/ingesting/ingest_my_data.py
new file mode 100755
index 00000000..cb0fa551
--- /dev/null
+++ b/R2R/ingesting/ingest_my_data.py
@@ -0,0 +1,99 @@
+from os import listdir
+from os.path import isfile, join
+import time
+import datetime
+import configparser
+from r2r import R2RClient
+
+cfg = configparser.ConfigParser()  # the directory names used below are looked up in _config.cfg
+cfg.read('_config.cfg')  # relative path, resolved against the current working directory; read() ignores files it cannot open
+
+client = R2RClient("http://localhost:7272")  # in this file the client is referenced only by the commented-out health check and the disabled loop in the string literal below
+
+#print("The status of the client is {0}".format(client.health()))
+
+# should be read from a configuration file
+main_file_dir = 'full path'  # placeholder value; referenced only inside the string literal below
+
+data_dir = cfg['PDF_DIR']  # NOTE(review): cfg[...] returns a SectionProxy, not a str, so the concatenations below would raise TypeError - presumably a key inside a section was intended; confirm against _config.cfg
+
+file_paths = [  # one entry per key read from the [genetics] section
+ data_dir+cfg['genetics']['diabetes'],
+ data_dir+cfg['genetics']['aging']
+]
+
+print("The defined file paths {0}".format(file_paths))
+  # NOTE: everything from the next line to the end of the file is one triple-quoted string literal, so the ingest loop in it never executes
+"""
+file_paths = [
+ main_file_dir+'pt02/b/',
+ main_file_dir+'pt03/c/',
+ main_file_dir+'pt03/a/',
+ main_file_dir+'pt03/b/',
+ main_file_dir+'pt03/c/',
+ main_file_dir+'pt04/a/',
+ main_file_dir+'pt04/b/',
+ main_file_dir+'pt04/c/',
+ main_file_dir+'pt05/a/',
+ main_file_dir+'pt05/b/',
+ main_file_dir+'pt05/c/',
+ main_file_dir+'pt06/a/',
+ main_file_dir+'pt06/b/',
+ main_file_dir+'pt06/c/',
+ main_file_dir+'pt07/a/',
+ main_file_dir+'pt07/b/',
+ main_file_dir+'pt07/c/',
+ main_file_dir+'pt08/a/',
+ main_file_dir+'pt08/b/',
+ main_file_dir+'pt08/c/',
+ main_file_dir+'pt09/a/',
+ main_file_dir+'pt09/b/',
+ main_file_dir+'pt09/c/',
+ main_file_dir+'pt10/a/',
+ main_file_dir+'pt10/b/',
+ main_file_dir+'pt10/c/',
+ main_file_dir+'pt11/a/',
+ main_file_dir+'pt11/b/',
+ main_file_dir+'pt11/c/',
+ main_file_dir+'pt12/a/',
+ main_file_dir+'pt12/b/',
+ main_file_dir+'pt12/c/',
+ main_file_dir+'pt13/a/',
+ main_file_dir+'pt13/b/',
+ main_file_dir+'pt13/c/',
+ main_file_dir+'pt14/a/',
+ main_file_dir+'pt14/b/',
+ main_file_dir+'pt14/c/',
+ main_file_dir+'pt14/d/'
+ ]
+
+
+ndx = 0
+for file_list in file_paths:
+ the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
+ '''
+ # subroutine to test list content
+ for the_pdf in the_pdfs:
+ print('{0} -> {1}'.format(ndx, the_pdf))
+ ndx += 1
+ '''
+ print(datetime.datetime.now())
+ begin_ingesting = datetime.datetime.now()
+ sleeptime = 30
+ try:
+ ingest_response = client.ingest_files(
+ file_paths=the_pdfs
+ )
+ except:
+ ingest_response = "Nothing ingested from {0}".format(file_list)
+ sleeptime = 1
+
+ end_ingesting = datetime.datetime.now()
+
+ # show results of ingesting documents
+ print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx, file_list, (end_ingesting-begin_ingesting), ingest_response))
+
+ # brace against pinging API too quickly
+ time.sleep(sleeptime)
+ ndx += 1
+""" \ No newline at end of file