Skip to content

Commit

Permalink
Added some CORD-specific edits
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Lease Morgan committed Apr 11, 2020
1 parent a43a433 commit 2e04566
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 16 deletions.
70 changes: 70 additions & 0 deletions bin/cord2carrel-map.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env bash

# cord2carrel-map.sh - given the CORD data set, create "study carrel plus", but just map (no reduce)

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame and distributed under a GNU Public License

# April 4, 2020 - first investigations


# fail fast: exit on error, on unset variables, and on broken pipelines
set -euo pipefail

# enhance environment
PERL_HOME='/export/perl/bin'
JAVA_HOME='/export/java/bin'
PYTHON_HOME='/export/python/bin'
PATH="$PYTHON_HOME:$PERL_HOME:$JAVA_HOME:$PATH"
export PATH

# configure
CARRELS='/export/reader/carrels'
CORPUS='./etc/reader.txt'
DB='./etc/reader.db'
REPORT='./etc/report.txt'

# require
DB2REPORT='/export/reader/bin/db2report.sh'
INITIALIZECARREL='/export/reader/bin/initialize-carrel.sh'
MAP='/export/reader/bin/map.sh'
METADATA2SQL='/export/reader/bin/metadata2sql.py'
REDUCE='/export/reader/bin/reduce.sh'

# the carrel is named after the current working directory
NAME=$( basename "$( pwd )" )
echo "Carrel name: $NAME" >&2


# NOTE(review): this is only a reminder; the script does not actually
# verify that ./txt and the metadata file exist before proceeding
echo "Here, make sure txt directory and metadata file exist" >&2


# create a study carrel
echo "Creating study carrel named $NAME" >&2
"$INITIALIZECARREL" "$NAME"

# read the metadata file and update the bibliographic database;
# wrap all inserts in one transaction so sqlite3 commits them atomically
echo "Reading metadata file and updating bibliographics" >&2
METADATA="$CARRELS/$NAME/metadata.csv"
mkdir -p ./tmp
"$METADATA2SQL" "$METADATA" > ./tmp/bibliographics.sql
echo "=== updating bibliographic database" >&2
{
	echo "BEGIN TRANSACTION;"
	cat ./tmp/bibliographics.sql
	echo "END TRANSACTION;"
} > ./tmp/update-bibliographics.sql
sqlite3 "$DB" < ./tmp/update-bibliographics.sql

# build the carrel; the magic happens here
echo "Building study carrel named $NAME" >&2

# extract parts-of-speech, named entities, etc
"$MAP" "$NAME"

# build ./etc/reader.txt; a plain text version of the whole thing:
# concatenate all texts, lower-case, strip digits, flatten newlines,
# and squeeze runs of spaces
echo "Building $CORPUS" >&2
rm -f "$CORPUS"
find ./txt -name '*.txt' -exec cat {} + > "$CORPUS"
tr '[:upper:]' '[:lower:]' < "$CORPUS"        > ./tmp/corpus.001
tr '[:digit:]' ' '         < ./tmp/corpus.001 > ./tmp/corpus.002
tr '\n' ' '                < ./tmp/corpus.002 > ./tmp/corpus.003
tr -s ' '                  < ./tmp/corpus.003 > "$CORPUS"

# done
exit
4 changes: 1 addition & 3 deletions bin/cord2carrel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,13 @@ REDUCE='/export/reader/bin/reduce.sh'
NAME=$( pwd )
NAME=$( basename $NAME )
echo "Carrel name: $NAME" >&2
echo >&2


echo "Here, make sure the txt and cache directories exist and contain data; make sure metadata file exists too" >&2
echo "Here, make sure txt directory and metadata file exit" >&2


# create a study carrel
echo "Creating study carrel named $NAME" >&2
echo "" >&2
$INITIALIZECARREL $NAME

# unzip the zip file and put the result in the cache
Expand Down
33 changes: 20 additions & 13 deletions bin/txt2urls.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,26 @@ LEAF=$( basename "$FILE" .txt )
mkdir -p "$ORIGINAL/../$URLS"
OUTPUT="$ORIGINAL/../$URLS/$LEAF.url"

# get the data
RECORDS=$(cat "$FILE" | egrep -o 'https?://[^ ]+' | sed -e 's/https/http/g' | sed -e 's/\W+$//g' | sed -e 's/\,$//g'| sed -e 's/\;$//g' | sed -e 's/\.$//g' | sed -e 's/)$//g' )

# if content found, then extract domain and output
SIZE=${#RECORDS}
if [[ $SIZE > 0 ]]; then

# proces each item in the data
printf "id\tdomain\turl\n" > "$OUTPUT"
while read -r RECORD; do
DOMAIN=$(echo $RECORD | sed -e 's/http:\/\///g' | sed -e 's/\/.*$//g')
echo -e "$LEAF\t$DOMAIN\t$RECORD" >> "$OUTPUT"
done <<< "$RECORDS"
# optionally, do the work
if [ -f "$OUTPUT" ]; then
echo "$OUTPUT exist" >&2

else
# get the data
RECORDS=$(cat "$FILE" | egrep -o 'https?://[^ ]+' | sed -e 's/https/http/g' | sed -e 's/\W+$//g' | sed -e 's/\,$//g'| sed -e 's/\;$//g' | sed -e 's/\.$//g' | sed -e 's/)$//g' )

# if content found, then extract domain and output
SIZE=${#RECORDS}
if [[ $SIZE > 0 ]]; then

# proces each item in the data
printf "id\tdomain\turl\n" > "$OUTPUT"
while read -r RECORD; do
DOMAIN=$(echo $RECORD | sed -e 's/http:\/\///g' | sed -e 's/\/.*$//g')
echo -e "$LEAF\t$DOMAIN\t$RECORD" >> "$OUTPUT"
done <<< "$RECORDS"

fi

fi

0 comments on commit 2e04566

Please sign in to comment.