Main Page | Namespace List | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | Examples

pdb_split.cc

This example shows how to split a pdb file various ways. A pdb can be split into different models, chains or event cut a particular chain.

/* Copyright 2004
   Stanford University

   This file is part of the DSR PDB Library.

   The DSR PDB Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The DSR PDB Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the DSR PDB Library; see the file COPYING.LIB.  If not, write to
   the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
   MA 02111-1307, USA. */

#include "dsrpdb/PDB.h"
#include <cstdlib>
#include <iostream>
#include <fstream>

#ifdef PDB_USE_BOOST_PROGRAM_OPTIONS
#include <boost/program_options.hpp>
#endif

int main(int argc, char *argv[]){
  bool split_chains=false;
  std::string input_file, output_template;
  int split_domain=-1;
  bool print_help=false;
  bool verbose=false;
  char split_chain='\0';

#ifdef PDB_USE_BOOST_PROGRAM_OPTIONS
  boost::program_options::options_description o("Allowed options"), po, ao;
  o.add_options()
    ("help", boost::program_options::bool_switch(&print_help),
     "produce help message")
    ("verbose,v", boost::program_options::bool_switch(&verbose),
     "print out verbose messages about reading and writing pdb files")
    ("split-chains,c", boost::program_options::bool_switch(&split_chains),
     "Split all chains into separate files.")
    ("select-chain,C", boost::program_options::value<char>(&split_chain),
      "Select this chain only.")
    ("domain-split,s", boost::program_options::value<int>(&split_domain),
     "Split a chain into domains at this residue.");
  po.add_options()
    ("input-pdb", boost::program_options::value< std::string>(&input_file),
     "input file")
    ("output-pdb-template", boost::program_options::value< std::string>(&output_template),
     "A sprintf style string that will be used to generate the names for the output files.");

  ao.add(o).add(po);

  boost::program_options::positional_options_description p;
  p.add("input-pdb", 1);
  p.add("output-pdb-template", 2);

  boost::program_options::variables_map vm;
  boost::program_options::store(boost::program_options::command_line_parser(argc, argv).
                                options(ao).positional(p).run(), vm);
  boost::program_options::notify(vm);


 
  //boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
  //boost::program_options::notify(vm);    

  if (input_file.empty()
      || output_template.empty()
      || (split_chains && split_chain != '\0') 
      || (split_chains && split_domain != -1) 
      || (split_chain != '\0' && split_domain != -1) 
      || print_help) {
    std::cout << "This program splits a pdb file with multiple models or multiple domains into multiple files each with one model .\n";
    std::cout << "useage: " << argv[0] 
              << " [-c] input-pdb template%d[%c].pdb\n" << std::endl;
    std::cout << "The second argument is an sprintf style string that will be used to generate the names for the output files.\n\n";
    std::cout << o << "\n";
    return EXIT_FAILURE;
  }

#else
  if (argc != 3){
    std::cerr << "The program is built without boost/program_options.hpp.\n";
    std::cerr << "useage: " << argv[0] << " [-c] file.pdb template%d[%c].pdb" << std::endl;
    return EXIT_FAILURE;
  }
  input_file = argv[1];
  output_template = argv[2];
#endif

  if (split_chain != '\0'){ 
    std::cout << "Splitting on chain " << split_chain << std::endl;
  } else if (split_domain != -1) {
    std::cout << "Splitting on residue " << split_domain << std::endl;
  } else {
    std::cout << "Splitting into chains " << std::endl;
  }


  // std::cout << input_file << " " << output_template << " " << split_domain << " " << split_chains << std::endl;

  std::ifstream in(input_file.c_str());
  if (!in) {
    std::cerr << "Error opening input file " << input_file << std::endl;
    return EXIT_FAILURE;
  }

  //= new char[strlen(argv[2]+1000)];
  dsrpdb::PDB pdb(in, verbose);

  std::cout << "Input PDB has " << pdb.number_of_models() << " models." << std::endl;

  if (split_domain != -1 && pdb.number_of_models()!= 1){
    std::cerr << "Splitting a domain can only work if there is only one model and chain.\n";
    return EXIT_FAILURE;
  }

  for (unsigned int i=0; i< pdb.number_of_models(); ++i){
    const dsrpdb::Model &m= pdb.model(i);
    if (split_chain != '\0' || split_chains || split_domain!= -1) {
      std::cout << "Model " << i << " has " << m.number_of_chains() << " chains."<< std::endl;
      if (split_domain != -1 && m.number_of_chains()!= 1){
        std::cerr << "Splitting a domain can only work if there is only one model and chain.\n";
        return EXIT_FAILURE;
      }
      for (unsigned int j=0; j< m.number_of_chains(); ++j){
        const dsrpdb::Protein &p= m.chain(j);
        
        if (split_chain != '\0') {
          if (p.chain() == split_chain || p.chain() == std::toupper(split_chain) ) {
            std::cout << "Writing chain " << p.chain() << std::endl;
            assert(split_domain ==-1);
           
            std::ofstream out(output_template.c_str());
            dsrpdb::PDB npdb;
            dsrpdb::Model nm;
            nm.new_chain(p);
            npdb.new_model(nm);
            npdb.set_header(pdb.header_begin(), pdb.header_end());
            npdb.write(out);
          } else {
            std::cout << "Skipping chain " << p.chain() << std::endl;
          }
        } else if (split_chains) {
          std::cout << "Writing chain " << p.chain() << std::endl;
          assert(split_domain ==-1);
          char buf[100000];
          if (pdb.number_of_models()==1) {
            sprintf(buf, output_template.c_str(),p.chain());
          } else {
            sprintf(buf, output_template.c_str(), m.index(), p.chain());
          }
          std::ofstream out(buf);
          dsrpdb::PDB npdb;
          dsrpdb::Model nm;
          nm.new_chain(p);
          npdb.new_model(nm);
          npdb.set_header(pdb.header_begin(), pdb.header_end());
          npdb.write(out);
        } else {
          dsrpdb::Protein ps[2]={dsrpdb::Protein(), dsrpdb::Protein()};
          for (dsrpdb::Protein::Const_residues_iterator rit = p.residues_begin(); 
               rit != p.residues_end(); ++rit){
            //rit->write('c', std::cout);
            if (static_cast<unsigned int>(rit->index())
                < static_cast<unsigned int>(split_domain)){
              ps[0].new_residue(*rit);
            } else {
              ps[1].new_residue(*rit);
            }
          }
          
          assert(ps[0].number_of_residues() + ps[1].number_of_residues()
                 == p.number_of_residues());
          
          for (unsigned int j=0; j< 2; ++j){
            std::cout << "Writing pdb with " << ps[j].number_of_residues() << " residues.\n";
            char buf[100000];
            sprintf(buf, output_template.c_str(), j);
            std::ofstream out(buf);
              dsrpdb::PDB npdb;
              dsrpdb::Model nm(0);
              nm.new_chain(ps[j]);
              npdb.new_model(nm);
              npdb.set_header(pdb.header_begin(), pdb.header_end());
              npdb.write(out);
            }
          }
      }
    } else {
      char buf[100000];
      sprintf(buf, argv[2], m.index());
      std::ofstream out(buf);
      dsrpdb::PDB npdb;
      npdb.new_model(m);
      npdb.set_header(pdb.header_begin(), pdb.header_end());
      npdb.write(out);
    }
  }
  //delete[] buf;
  return EXIT_SUCCESS;
}