How does foreachPartition work in Spark? - apache-spark

I am new to Apache Spark and am trying to run a custom nearest-neighbor algorithm on an RDD that has been partitioned into two parts using a custom partitioner. The JavaPairRDD contains the graph details and the random objects created on the graph.
According to my logic, I build a subgraph for each partition and run a custom algorithm on each subgraph. It seems to work, although not correctly. I am not sure whether this is the right way to apply an action on each partition. I am adding my code and the results as well. Comments and suggestions are highly appreciated.
// <Partition_Index_Key, Map<Source_Vertex, Map<Destination_Vertex, Tuple2<Edge_Length, ArrayList<RoadObject>>>>>
JavaPairRDD<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> adjVertForSubgraphsRDD = jscontext
        .parallelizePairs(adjacentVerticesForSubgraphs)
        .partitionBy(new CustomPartitioner(CustomPartitionSize));

// Applying the foreachPartition action on the JavaPairRDD
adjVertForSubgraphsRDD.foreachPartition(
        new VoidFunction<Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(
                    Iterator<Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>>> tupleRow)
                    throws Exception {
                CoreGraph subgraph0 = new CoreGraph();
                CoreGraph subgraph1 = new CoreGraph();

                while (tupleRow.hasNext()) {
                    // Read each tuple exactly once; calling tupleRow.next() more than
                    // once per iteration (as in the original code) silently skips rows.
                    Tuple2<Object, Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>>> row = tupleRow.next();
                    int partitionKey = Integer.parseInt(String.valueOf(row._1()));
                    Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>> newMap = row._2();

                    if (partitionKey == 0) {
                        buildSubgraph(subgraph0, newMap);
                    } else if (partitionKey == 1) {
                        buildSubgraph(subgraph1, newMap);
                    }
                }

                // Straightforward nearest-neighbor algorithm from each true to false object.
                ANNNaive ann = new ANNNaive();
                System.err.println("-------------------------------");
                Map<Integer, Integer> nearestNeighborPairsSubg0 = ann.compute(subgraph0, true);
                System.out.println("for subgraph0");
                System.out.println(nearestNeighborPairsSubg0);
                System.err.println("-------------------------------");
                System.err.println("-------------------------------");
                Map<Integer, Integer> nearestNeighborPairsSubg1 = ann.compute(subgraph1, true);
                System.out.println("for subgraph1");
                System.out.println(nearestNeighborPairsSubg1);
                System.err.println("-------------------------------");
            }

            // Adds every edge in the adjacency map to the given subgraph, together
            // with any road objects attached to that edge.
            private void buildSubgraph(CoreGraph subgraph,
                    Map<Object, Map<Object, Tuple2<Double, ArrayList<RoadObject>>>> newMap) {
                for (Object srcVertex : newMap.keySet()) {
                    for (Object dstVertex : newMap.get(srcVertex).keySet()) {
                        Tuple2<Double, ArrayList<RoadObject>> edgeInfo = newMap.get(srcVertex).get(dstVertex);
                        int sourceVertex = Integer.parseInt(String.valueOf(srcVertex));
                        int destVertex = Integer.parseInt(String.valueOf(dstVertex));
                        double edgeLength = edgeInfo._1();
                        subgraph.addEdge(sourceVertex, destVertex, edgeLength);

                        if (edgeInfo._2() != null) {
                            int currentEdgeId = subgraph.getEdgeId(sourceVertex, destVertex);
                            for (RoadObject obj : edgeInfo._2()) {
                                RoadObject rn = new RoadObject();
                                rn.setObjId(obj.getObjectId());
                                rn.setType(obj.getType());
                                rn.setDistanceFromStartNode(obj.getDistanceFromStartNode());
                                subgraph.addObjectOnEdge(currentEdgeId, rn);
                            }
                        }
                    }
                }
            }
        });
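For reference, here is a minimal, self-contained sketch of the usual foreachPartition pattern (local mode, toy data, hypothetical class name), isolated from the graph-building logic above. The key point is that the partition iterator is consumed exactly once per element:

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class ForeachPartitionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sketch").setMaster("local[2]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            jsc.parallelizePairs(Arrays.asList(
                            new Tuple2<>(0, "a"), new Tuple2<>(0, "b"), new Tuple2<>(1, "c")), 2)
                    .foreachPartition((Iterator<Tuple2<Integer, String>> it) -> {
                        while (it.hasNext()) {
                            Tuple2<Integer, String> row = it.next(); // next() once per element
                            System.out.println(row._1() + " -> " + row._2());
                        }
                    });
        }
    }
}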

Related

Best Practice to limit NotesViewEntryCollection from getAllEntries

I want to create a better web service that displays a collection from a NotesView with pagination, and I have found a performance issue with View.getAllEntries on bigger views.
In MongoDB, I can use findAll() with skip() and limit().
How can I do the same in Domino?
Use the ViewNavigator class. If you are paging through a large view, it is much faster than view.getAllEntries().
You can acquire an instance of ViewNavigator with view.createViewNav() or a similar method. For best performance, call view.setAutoUpdate(false) before you acquire the navigator.
You can find lots more information by searching the web. This article looks like a good place to start.
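In ViewNavigator terms, MongoDB's skip()/limit() pair maps roughly onto nav.skip(position) followed by reading at most limit entries. Here is a minimal sketch of that core loop (a hypothetical helper, assuming a flat, non-categorized view; the full methods below flesh it out):

import lotus.domino.View;
import lotus.domino.ViewEntry;
import lotus.domino.ViewNavigator;

// Hypothetical helper: prints one "page" of a flat (non-categorized) view.
static void printPage(View view, int position, int limit) throws Exception {
    view.setAutoUpdate(false);              // freeze the index while navigating
    ViewNavigator nav = view.createViewNav();
    nav.skip(position);                     // like MongoDB skip()
    int read = 0;
    ViewEntry entry = nav.getCurrent();
    while (entry != null && read < limit) { // like MongoDB limit()
        System.out.println(entry.getPosition('.'));
        read++;
        ViewEntry next = nav.getNext(entry);
        entry.recycle();                    // always recycle Domino handles
        entry = next;
    }
    nav.recycle();
}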
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.ibm.commons.util.io.json.JsonJavaObject;

import lotus.domino.NotesException;
import lotus.domino.View;
import lotus.domino.ViewColumn;
import lotus.domino.ViewEntry;
import lotus.domino.ViewEntryCollection;
import lotus.domino.ViewNavigator;

private String consultView(View view, int counter, int position) throws Exception {
    String strValue = "";
    int count = 0;

    // Disable auto-update before acquiring the navigator for best performance.
    view.setAutoUpdate(false);
    ViewNavigator nav = view.createViewNav();
    nav.setEntryOptions(ViewNavigator.VN_ENTRYOPT_NOCOUNTDATA);
    nav.setBufferMaxEntries(400);

    int limit = counter;
    int skippedEntries = nav.skip(position);
    if (skippedEntries == position) {
        // Map column-value indexes to item names so entries can be read generically.
        Map<Integer, String> columnNameMap = new HashMap<Integer, String>();
        for (ViewColumn col : (List<ViewColumn>) view.getColumns()) {
            if (col.getColumnValuesIndex() < 65535) {
                columnNameMap.put(col.getColumnValuesIndex(), col.getItemName());
            }
        }

        List<Map<String, Object>> nodeData = new ArrayList<Map<String, Object>>();
        ViewEntry entry = nav.getCurrent();
        while (entry != null && count <= (limit - 1)) {
            if (!entry.isCategory()) {
                try {
                    HashMap<String, Object> entryMap = new HashMap<String, Object>();
                    count++;
                    List<Object> columnValues = entry.getColumnValues();
                    entryMap.put("unid", entry.getUniversalID());
                    entryMap.put("position", entry.getPosition('.'));
                    entryMap.put("pos", entry.getPosition('.'));
                    entryMap.put("userpos", count);
                    for (Integer index : columnNameMap.keySet()) {
                        entryMap.put(columnNameMap.get(index), columnValues.get(index));
                    }
                    nodeData.add(entryMap);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            // Fetch the next entry before recycling the current one.
            ViewEntry tmpentry = nav.getNext(entry);
            entry.recycle();
            entry = tmpentry;
        }

        JsonJavaObject returnJSON = new JsonJavaObject();
        returnJSON.put("errorcode", 0);
        returnJSON.put("errormessage", "");
        returnJSON.put("total", getViewCount(view));
        returnJSON.put("data", nodeData);
        strValue = returnJSON.toString();
    }
    nav.recycle();
    view.recycle();
    return strValue;
}

private int getViewCount(View view) throws NotesException {
    ViewEntryCollection entryCollection = view.getAllEntries();
    int count = entryCollection.getCount();
    entryCollection.recycle();
    return count;
}
The function below gets all the entries from a view and outputs the result as a JSON object. (Note that it references viewObject and Utilisties from the answerer's surrounding framework.) Please try the following and let me know if it works.
private String consultView(View view, int counter, int position) throws Exception {
    String strValue = "";
    int count = 0;

    view.setAutoUpdate(false);
    ViewNavigator nav = view.createViewNav();
    nav.setEntryOptions(ViewNavigator.VN_ENTRYOPT_NOCOUNTDATA);
    nav.setBufferMaxEntries(400);

    int limit = counter;
    int skippedEntries = nav.skip(position);
    if (skippedEntries == position) {
        // Only keep the columns the caller asked for (viewObject and Utilisties
        // come from the author's own framework).
        Map<Integer, String> columnNameMap = new HashMap<Integer, String>();
        for (ViewColumn col : (List<ViewColumn>) view.getColumns()) {
            if (col.getColumnValuesIndex() < 65535
                    && Utilisties.containsVar(viewObject.getRetCols(), col.getItemName())) {
                columnNameMap.put(col.getColumnValuesIndex(), col.getItemName());
            }
        }

        List<Map<String, Object>> nodeData = new ArrayList<Map<String, Object>>();
        ViewEntry entry = nav.getCurrent();
        while (entry != null && count <= (limit - 1)) {
            if (!entry.isCategory()) {
                try {
                    HashMap<String, Object> entryMap = new HashMap<String, Object>();
                    count++;
                    List<Object> columnValues = entry.getColumnValues();
                    entryMap.put("unid", entry.getUniversalID());
                    entryMap.put("position", entry.getPosition('.'));
                    entryMap.put("pos", entry.getPosition('.'));
                    entryMap.put("userpos", count);
                    for (Integer index : columnNameMap.keySet()) {
                        entryMap.put(columnNameMap.get(index), columnValues.get(index));
                    }
                    nodeData.add(entryMap);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            ViewEntry tmpentry = nav.getNext(entry);
            entry.recycle();
            entry = tmpentry;
        }

        JsonJavaObject returnJSON = new JsonJavaObject();
        returnJSON.put("errorcode", 0);
        returnJSON.put("errormessage", "");
        if (viewObject.getGetCount()) {
            returnJSON.put("total", getViewCount(view));
        }
        returnJSON.put("data", nodeData);
        strValue = returnJSON.toString();
    }
    nav.recycle();
    view.recycle();
    return strValue;
}

Accessing Map Value

static void getRecommendations(Map<User, HashMap<Item, Double>> map, User to) {
    // `scores` and sortMapByScore(...) are defined elsewhere in the class.
    scores = sortMapByScore(scores, to);
    TreeMap<User, Double> scores1 = (TreeMap<User, Double>) scores.get(to);
    Set<User> user = (Set<User>) scores1.keySet();
    Iterator<User> itr = user.iterator();
    Map<Item, Double> rec = new HashMap<Item, Double>();
    int i = 0;
    while (itr.hasNext() && i < 5) {
        User u = itr.next();
        /* for (Item e : map.get(to).keySet()) { */
        for (Item e1 : map.get(u).keySet()) {
            if (!map.get(to).containsKey(e1)) {
                if (rec.containsKey(e1)) {
                    double sc = rec.get(e1);
                    rec.put(e1, sc + map.get(u).get(e1) * scores.get(to).get(u));
                } else {
                    // System.out.println(scores);
                    rec.put(e1, map.get(u).get(e1) * scores.get(to).get(u));
                }
                // }
            }
        }
        i++;
    }
    TreeMap<Item, Double> res = new TreeMap<Item, Double>(new ValueComparator(rec));
    res.putAll(rec);
    int k = 0;
    for (Item d : res.keySet()) {
        System.out.println(d.getmTitle());
        k++;
        if (k == 5) {
            break;
        }
    }
}
I am using nested HashMap and TreeMap in this example, but I am facing the problem below.
In the line
rec.put(e1, map.get(u).get(e1) * scores.get(u).get(to));
I am getting a NullPointerException, even though I am using the same HashMap's key set to get the values.
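Since the full class isn't shown, this is only a guess, but the usual cause of an NPE on a line like this is one of the lookups (map.get(u), scores.get(to), or an inner .get(...)) returning null, which then fails when the Double is auto-unboxed for the multiplication. A null-safe sketch of the scoring step, using the same names as the question (the local variable names are hypothetical):

// Hypothetical null-safe rewrite of the scoring line from the question.
HashMap<Item, Double> itemsOfU = map.get(u);                       // null if u is not a key
Map<User, Double> similarTo = (Map<User, Double>) scores.get(to);  // null if `to` is not a key
if (itemsOfU != null && similarTo != null) {
    Double itemScore = itemsOfU.get(e1);
    Double weight = similarTo.get(u);                              // null if u has no score for `to`
    if (itemScore != null && weight != null) {
        // merge() accumulates the weighted score without the containsKey dance.
        rec.merge(e1, itemScore * weight, Double::sum);
    }
}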

How to save the trained data in openimaj?

I'm working on a project that takes attendance of a class from the class video. I'm training the data while the program is running, and it is taking a lot of time to train. Is there any way I can save the trained data and use it directly in the program? Below is my code:
public static void main(String[] args) throws MalformedURLException, IOException, VideoCaptureException {
    FKEFaceDetector faceDetector = new FKEFaceDetector(new HaarCascadeDetector(40));
    EigenFaceRecogniser<KEDetectedFace, Person> faceRecogniser =
            EigenFaceRecogniser.create(20, new RotateScaleAligner(), 1, DoubleFVComparison.CORRELATION, 0.9f);
    final FaceRecognitionEngine<KEDetectedFace, Person> faceEngine =
            FaceRecognitionEngine.create(faceDetector, faceRecogniser);

    Video<MBFImage> video;
    //video = new VideoCapture(320, 100);
    video = new XuggleVideo(new URL("file:///home/kamal/Videos/Samplevideo1.mp4"));

    String[] names = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l" };
    Person[] dataset = new Person[names.length];
    for (int i = 0; i < names.length; i++) {
        dataset[i] = new Person(names[i]);
    }

    // Train on the first successfully detected face among up to 20 images per person.
    for (int i = 0; i < dataset.length; i++) {
        int dcount = 0;
        for (int j = 1; j <= 20 && dcount == 0; j++) {
            MBFImage mbfImage = ImageUtilities.readMBF(
                    new URL("file:///home/kamal/Pictures/" + i + "/" + j + ".png"));
            FImage fimg = mbfImage.flatten();
            List<KEDetectedFace> faces = faceEngine.getDetector().detectFaces(fimg);
            if (faces.size() > 0) {
                faceEngine.train(faces.get(0), dataset[i]);
                dcount++;
            }
        }
    }

    VideoDisplay<MBFImage> vd = VideoDisplay.createVideoDisplay(video);
    vd.addVideoListener(new VideoDisplayListener<MBFImage>() {
        public void afterUpdate(VideoDisplay<MBFImage> display) {
        }

        public void beforeUpdate(MBFImage frame) {
            FImage image = frame.flatten();
            List<KEDetectedFace> faces = faceEngine.getDetector().detectFaces(image);
            for (DetectedFace face : faces) {
                frame.drawShape(face.getBounds(), RGBColour.RED);
                try {
                    List<IndependentPair<KEDetectedFace, ScoredAnnotation<Person>>> rfaces =
                            faceEngine.recogniseBest(face.getFacePatch());
                    ScoredAnnotation<Person> score = rfaces.get(0).getSecondObject();
                    if (score != null) {
                        System.out.println("Mr. " + score.annotation + " is Present.");
                    } else {
                        System.out.println("Recognizing");
                    }
                } catch (Exception e) {
                    // Ignore frames where recognition fails.
                }
            }
        }
    });
}
Yes, just use the static methods in the org.openimaj.io.IOUtils class to write the faceEngine to disk once it's trained and read it back in again.
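For example, something along these lines. This is an unverified sketch: check the exact IOUtils method names and whether FaceRecognitionEngine is binary-readable/writeable in your OpenIMAJ version, and note that the file path is hypothetical:

import java.io.File;

import org.openimaj.io.IOUtils;

// After the training loop, persist the engine once...
File modelFile = new File("/home/kamal/faceEngine.bin");  // hypothetical location
IOUtils.writeBinary(modelFile, faceEngine);

// ...and on later runs restore it instead of retraining.
FaceRecognitionEngine<KEDetectedFace, Person> restored =
        IOUtils.read(modelFile, FaceRecognitionEngine.class);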

Weka - Naive Bayes always gives borderline results

I am trying to write a text classifier in Weka with Naive Bayes. I have a collection of Foursquare tips as training data, with close to 500 of them marked as positive and approximately the same number marked as negative, in an Excel file. The input file has two columns, the first being the tip text and the second the marked polarity. I am using AFINN-111.txt to add an attribute to enhance the output: it looks up every polar word in the tip and produces a final score summed over all the words. Here is my entire code:
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jxl.Sheet;
import jxl.Workbook;

public class DataReader {
    static Map<String, Integer> affinMap = new HashMap<String, Integer>();

    public List<List<Object>> createAttributeList() {
        ClassLoader classLoader = getClass().getClassLoader();
        initializeAFFINMap(classLoader);
        File inputWorkbook = new File(classLoader.getResource("Tip_dataset2.xls").getFile());
        Workbook w;
        Sheet sheet = null;
        try {
            w = Workbook.getWorkbook(inputWorkbook);
            // Get the first sheet
            sheet = w.getSheet(0);
        } catch (Exception e) {
            e.printStackTrace();
        }
        List<List<Object>> attributeList = new ArrayList<List<Object>>();
        for (int i = 1; i < sheet.getRows(); i++) {
            String tip = sheet.getCell(0, i).getContents();
            tip = tip.replaceAll("'", "");
            tip = tip.replaceAll("\"", "");
            tip = tip.replaceAll("%", " percent");
            tip = tip.replaceAll("#", " ATAUTHOR");
            String polarity = getPolarity(sheet.getCell(1, i).getContents());
            // Sum the AFINN scores of every polar word in the tip.
            int affinScore = 0;
            String[] arr = tip.split(" ");
            for (int j = 0; j < arr.length; j++) {
                if (affinMap.containsKey(arr[j].toLowerCase())) {
                    affinScore = affinScore + affinMap.get(arr[j].toLowerCase());
                }
            }
            List<Object> attrs = new ArrayList<Object>();
            attrs.add(tip);
            attrs.add(affinScore);
            attrs.add(polarity);
            attributeList.add(attrs);
        }
        return attributeList;
    }

    private String getPolarity(String cell) {
        if (cell.equalsIgnoreCase("positive")) {
            return "positive";
        } else {
            return "negative";
        }
    }

    private void initializeAFFINMap(ClassLoader classLoader) {
        try {
            InputStream stream = classLoader.getResourceAsStream("AFINN-111.txt");
            BufferedReader br = new BufferedReader(new InputStreamReader(stream));
            String str;
            while ((str = br.readLine()) != null) {
                // Each AFINN line is "word<TAB>score".
                String[] array = str.split("\t");
                affinMap.put(array[0], Integer.parseInt(array[1]));
            }
            br.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        List<List<Object>> attrList = new DataReader().createAttributeList();
        new CreateTrainedModel().createTrainingData(attrList);
    }
}
Here is the actual classifier class:
import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.util.Arrays;
import java.util.List;

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayes;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;

public class CreateTrainedModel {

    public void createTrainingData(List<List<Object>> attrList) throws Exception {
        Attribute tip = new Attribute("tip", (FastVector) null); // string attribute
        Attribute affin = new Attribute("affinScore");
        FastVector pol = new FastVector(2);
        pol.addElement("positive");
        pol.addElement("negative");
        Attribute polaritycl = new Attribute("polarity", pol);

        FastVector inputDataDesc = new FastVector(3);
        inputDataDesc.addElement(tip);
        inputDataDesc.addElement(affin);
        inputDataDesc.addElement(polaritycl);

        Instances dataSet = new Instances("dataset", inputDataDesc, attrList.size());
        // Set class index
        dataSet.setClassIndex(2);

        for (List<Object> onList : attrList) {
            Instance in = new Instance(3);
            in.setValue((Attribute) inputDataDesc.elementAt(0), onList.get(0).toString());
            in.setValue((Attribute) inputDataDesc.elementAt(1), Integer.parseInt(onList.get(1).toString()));
            in.setValue((Attribute) inputDataDesc.elementAt(2), onList.get(2).toString());
            dataSet.add(in);
        }

        // Turn the raw tip strings into word-frequency attributes.
        Filter f = new StringToWordVector();
        f.setInputFormat(dataSet);
        dataSet = Filter.useFilter(dataSet, f);

        Classifier model = new NaiveBayes();
        try {
            model.buildClassifier(dataSet);
        } catch (Exception e1) {
            e1.printStackTrace();
        }

        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("FS-TipsNaiveBayes.model"));
        oos.writeObject(model);
        oos.flush();
        oos.close();

        FastVector fvWekaAttributes1 = new FastVector(3);
        fvWekaAttributes1.addElement(tip);
        fvWekaAttributes1.addElement(affin);
        Instance in = new Instance(3);
        in.setValue((Attribute) fvWekaAttributes1.elementAt(0), "burger here is good");
        in.setValue((Attribute) fvWekaAttributes1.elementAt(1), 0);
        Instances testSet = new Instances("dataset", fvWekaAttributes1, 1);
        in.setDataset(testSet);

        double[] fDistribution = model.distributionForInstance(in);
        // Print the actual class probabilities, not the array reference.
        System.out.println(Arrays.toString(fDistribution));
    }
}
The problem I am facing is that, for any input, the output distribution is always around [0.52314376998377, 0.47685623001622995], always leaning slightly more towards positive than negative. These figures do not change drastically. Any idea what I am doing wrong?
I didn't read your code in detail, but one thing I can say is that the AFINN score is normalized within a certain range. If your output always leans towards the positive side, you may need to change your classification cost function, because the model is overfitting your data.
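It may also be worth cross-validating on the training set and inspecting the confusion matrix before tuning anything; note as well that the single test instance in the question is never passed through the same StringToWordVector filter as the training data, which by itself can produce near-constant distributions. A quick check with Weka's standard Evaluation class, using the filtered dataSet built in the question:

import java.util.Random;

import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;

// 10-fold cross-validation over the same filtered training data.
Evaluation eval = new Evaluation(dataSet);
eval.crossValidateModel(new NaiveBayes(), dataSet, 10, new Random(1));
System.out.println(eval.toSummaryString());
System.out.println(eval.toMatrixString()); // confusion matrix: positive vs. negative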

Sharepoint 2010 custom webpart paging

I am trying to implement simple paging in my SharePoint web part. I have a single news articles list with some simple columns. I want to show five items per page, with numerical paging at the bottom. I have gone through the net trying to understand SPListItemCollectionPosition, but with no luck. If anyone can help, please give me a simple code example or some guidance.
Many thanks,
Chris
I would suggest using SPDataSource and an SPGridView; together they will implement paging and many other cool features with minimal or no code.
Use this as a guide for some of the classes/methods/properties you might need to get paging to work. Be aware that this code does not compile; I have just pulled together various snippets from my own list-results framework, which includes paging, sorting, grouping and caching. It should be enough to get you started, though.
public class PagedListResults : System.Web.UI.WebControls.WebParts.WebPart {
    // Assumed ViewState key prefix; the original framework defines this elsewhere.
    private const string KEY_PagingPrefix = "PagedList_";

    protected SPPagedGridView oGrid;

    protected override void CreateChildControls() {
        this.oGrid = new SPPagedGridView();
        oGrid.AllowPaging = true;
        oGrid.PageIndexChanging += new GridViewPageEventHandler(oGrid_PageIndexChanging);
        Controls.Add(oGrid);
        oGrid.PagerTemplate = null; // Must be set after Controls.Add(oGrid)
        oGrid.PagerSettings.Mode = PagerButtons.NumericFirstLast;
        oGrid.PagerSettings.PageButtonCount = 3;
        oGrid.PagerSettings.Position = PagerPosition.TopAndBottom;
        base.CreateChildControls();
    }

    public override void DataBind() {
        base.DataBind();

        // Build the filter/paging state first. (In the original snippets this was
        // declared after use; it has been reordered so the method reads top-down.)
        FilterInfo info = new FilterInfo();
        string tmp = "<View></View>";
        tmp = tmp.Replace("<View><Query>", string.Empty);
        tmp = tmp.Replace("</Query></View>", string.Empty);
        info.Caml = tmp;
        info.PagingInfoData = string.Empty;
        info.CurrentPage = oGrid.CurrentPageIndex;
        info.PageSize = oGrid.PageSize;
        if (oGrid.PageIndex == 0 || oGrid.CurrentPageIndex == 0) {
            // First page: no paging position is needed, and setting one breaks things.
        } else {
            StringBuilder value = new StringBuilder();
            value.Append("Paged=TRUE");
            value.AppendFormat("&p_ID={0}", ViewState[KEY_PagingPrefix + "ID:" + oGrid.PageIndex]);
            info.PagingInfoData = value.ToString();
        }

        SPQuery q = new SPQuery();
        q.RowLimit = (uint)info.PageSize;
        if (!string.IsNullOrEmpty(info.PagingInfoData)) {
            q.ListItemCollectionPosition = new SPListItemCollectionPosition(info.PagingInfoData);
        }
        q.Query = info.Caml;
        SPListItemCollection items = SPContext.Current.List.GetItems(q);

        // Remember the boundary item ID of every page so later requests can resume
        // from the right SPListItemCollectionPosition (IDs are not always ascending).
        int pagecount = (int)Math.Ceiling(items.Count / (double)oGrid.PageSize);
        for (int i = 1; i < pagecount; i++) {
            SPListItem item = items[(i * oGrid.PageSize) - 1];
            ViewState[KEY_PagingPrefix + "ID:" + i] = item.ID;
        }
        oGrid.VirtualCount = items.Count;

        // DataBindListData (not shown) copies the items into the DataTable.
        DataTable table = new DataTable("Data");
        DataBindListData(table, items);
        this.oGrid.DataSource = table;
        this.oGrid.DataBind();
        this.oGrid.PageIndex = oGrid.CurrentPageIndex; // must be reset after DataBind
    }

    void oGrid_PageIndexChanging(object sender, GridViewPageEventArgs e) {
        oGrid.PageIndex = e.NewPageIndex;
        oGrid.CurrentPageIndex = oGrid.PageIndex;
    }
}

public class FilterInfo {
    public string Caml;
    public string PagingInfoData;
    public int CurrentPage;
    public int PageSize;
}

public class SPPagedGridView : SPGridView {
    protected override void InitializePager(GridViewRow row, int columnSpan, PagedDataSource pagedDataSource) {
        // Custom paging: report the total (virtual) row count to the pager rather
        // than the count of rows actually bound on this page.
        pagedDataSource.AllowCustomPaging = true;
        pagedDataSource.VirtualCount = virtualcount;
        pagedDataSource.CurrentPageIndex = currentpageindex;
        base.InitializePager(row, columnSpan, pagedDataSource);
    }

    private int virtualcount = 0;
    public int VirtualCount {
        get { return virtualcount; }
        set { virtualcount = value; }
    }

    private int currentpageindex = 0;
    public int CurrentPageIndex {
        get { return currentpageindex; }
        set { currentpageindex = value; }
    }
}
Check out my post on how to page using SPListItemCollectionPosition; I wrote a component to page over lists, maybe it can help: http://hveiras.wordpress.com/2011/11/07/listpagert-using-splistitemcollectionposition/
