# for emacs: -*- mode: sh; -*-


# This file describes how the proteins092903 database was made using
# the 09/29/03 release of NR database files from SWISS-PROT

o Create and populate sp092903 database according to makeSp092903.doc.

o create the proteins092903 mySQL database at hgwdev
  from an interactive mysql session

	create database proteins092903;  

o make a subdirectory of:

	/cluster/bluearc/fan/proteins/092903

  and go there.

o dump table definitions from proteins072003

	mysqldump -d proteins072003 -u hgcat -p$HGPSWD > proteins.sql

o edit proteins.sql to comment out indexes i1, i2, i3, and i4 of the spXref2 table

o create tables in proteins

	mysql -u hgcat -p$HGPSWD -A proteins092903 <proteins.sql

o Generate the files spXref2.tab, spXref3.tab, and spOrganism.tab for the proteins092903 database from the sp092903 database

        spToProteins 092903

  This will take about an hour and a half.

o Load the tables

        LOAD DATA local INFILE 'spXref2.tab' into table proteins092903.spXref2;

        create index i1 on proteins092903.spXref2(accession);
        create index i2 on proteins092903.spXref2(displayID);
        create index i3 on proteins092903.spXref2(extAC);
        create index i4 on proteins092903.spXref2(bioentryID);

        LOAD DATA local INFILE 'spXref3.tab' into table proteins092903.spXref3;

        create index ii1 on proteins092903.spXref3(accession);
        create index ii2 on proteins092903.spXref3(displayID);
        create index ii3 on proteins092903.spXref3(hugoSymbol);

    	LOAD DATA local INFILE 'spOrganism.tab' into table proteins092903.spOrganism;

o Build the database table for HUGO

        o ftp over the HUGO data file:
	    cd /cluster/store5/protein/hugo
	    mkdir 100503
	    cd 100503
	    wget http://www.gene.ucl.ac.uk/public-files/nomen/nomeids.txt

        o remove the first line and save the file as hugo.tab

        o load the data, at the mysql prompt

	     LOAD DATA local INFILE 'hugo.tab' into table proteins092903.hugo;

o Build spSecondaryID table

  Use the following SQL command to generate the 2nd.tab file for the spSecondaryID table.

	select displayId.val, displayId.acc, otherAcc.val from sp092903.displayId, 
	sp092903.otherAcc where otherAcc.acc = displayId.acc;

  Load the table:

    LOAD DATA local INFILE '2nd.tab' into table proteins092903.spSecondaryID;

o Build pfamXref and pfamDesc tables

  Ftp over Pfam-A.full.gz from: 

	ftp.sanger.ac.uk/pub/databases/Pfam/Pfam-A.full.gz

  and save it at /cluster/store5/proteins/pfam/092903

  Uncompress it by

	gzip -d Pfam-A.full.gz

  Run pfamXref to generate pfamAXref.tab and pfamADesc.tab files

	pfamXref proteins092903 /cluster/store5/proteins/pfam/072803/Pfam-A.full pfamADesc.tab pfamAXref.tab

  Load them into mySQL

	load data local infile "pfamADesc.tab" into table proteins092903.pfamDesc;
	load data local infile "pfamAXref.tab" into table proteins092903.pfamXref;

o Build pdbSP table

  Go to SWISS-PROT web page at:

	http://us.expasy.org/cgi-bin/lists?pdbtosp.txt

  Save this page into pdbtosp.htm.
  Then run the pdbSP program:
	
	pdbSP proteins092903
	
  Load the output file, pdbSP.tab, into mySQL

	load data local infile "pdbSP.tab" into table proteins092903.pdbSP

o Build spDisease table

  Create the spDisease table in proteins092903.

	CREATE TABLE spDisease (
  	accession varchar(40) NOT NULL default '',
  	displayID varchar(40) NOT NULL default '',
  	diseaseDesc text,
  	KEY accession (accession),
  	KEY displayID (displayID)
	) TYPE=MyISAM;
	
  Use the following SQL and hgsql to retrieve rows and create spDisease.tab.

select comment.acc, displayId.val, commentVal.val from sp092903.comment, sp092903.commentVal, sp092903.displayId where comment.commentType=19 and commentVal.id=comment.commentVal and displayId.acc=comment.acc;

  Load the table into proteins092903.

     load data local infile "spDisease.tab" into table proteins092903.spDisease;


